author | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2022-06-15 21:39:32 +0000 |
---|---|---|
committer | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2022-06-15 21:39:32 +0000 |
commit | 854cabe58fe83993ab608b428c6a97c5565dcb0c (patch) | |
tree | c0a00b9b4d52ff3dfeb50f5d894bad2d71389b00 | |
parent | 9c6c6103b62bb8941c2bd711f0e6cb47b6f10b2e (diff) | |
parent | 98f9e8aacdf9898e4ff093385365a233d25bf24f (diff) | |
Snap for 8730993 from 98f9e8aacdf9898e4ff093385365a233d25bf24f to mainline-tzdata3-release
Change-Id: I3aff676785fcc7c7da09269c5fb50e4461fbdea1
169 files changed, 3670 insertions, 10188 deletions
@@ -82,13 +82,14 @@ cc_library_shared { "libutf", ], shared_libs: [ - "libicu", + "libandroidicu", "liblog", - "libprotobuf-cpp-lite", + // TODO(b/147509515): We only need the full version for GzipStream. If we can remove + // that dependency, then we can just use libprotobuf-cpp-lite + "libprotobuf-cpp-full", "libz", ], version_script: "icing/jni.lds", - min_sdk_version: "Tiramisu", } // TODO(cassiewang): Add build rules and a TEST_MAPPING for cc_tests diff --git a/CMakeLists.txt b/CMakeLists.txt index 8c8e439..01ee8eb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,8 +14,6 @@ cmake_minimum_required(VERSION 3.10.2) -project(icing) - add_definitions("-DICING_REVERSE_JNI_SEGMENTATION=1") set(VERSION_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/icing/jni.lds") set(CMAKE_SHARED_LINKER_FLAGS @@ -76,7 +74,7 @@ foreach(FILE ${Icing_PROTO_FILES}) "${Icing_PROTO_GEN_DIR}/${FILE_NOEXT}.pb.h" COMMAND ${Protobuf_PROTOC_PATH} --proto_path "${CMAKE_CURRENT_SOURCE_DIR}/proto" - --cpp_out "lite:${Icing_PROTO_GEN_DIR}" + --cpp_out ${Icing_PROTO_GEN_DIR} ${FILE} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/proto/${FILE} @@ -129,4 +127,4 @@ target_include_directories(icing PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) target_include_directories(icing PRIVATE ${Icing_PROTO_GEN_DIR}) target_include_directories(icing PRIVATE "${Protobuf_SOURCE_DIR}/src") target_include_directories(icing PRIVATE "${ICU_SOURCE_DIR}/include") -target_link_libraries(icing protobuf::libprotobuf-lite libandroidicu log z) +target_link_libraries(icing protobuf::libprotobuf libandroidicu log) @@ -1,3 +0,0 @@ -adorokhine@google.com -tjbarron@google.com -dsaadati@google.com diff --git a/TEST_MAPPING b/TEST_MAPPING index baef43b..37cb5fc 100644 --- a/TEST_MAPPING +++ b/TEST_MAPPING @@ -4,14 +4,9 @@ "name": "IcingSearchEngineTest" } ], - "hwasan-postsubmit": [ - { - "name": "IcingSearchEngineTest" - } - ], "imports": [ { - "path": "packages/modules/AppSearch" + "path": "frameworks/base/apex/appsearch/service/java/com/android/server/appsearch" } ] } diff --git a/build.gradle b/build.gradle index 5b5f3a6..882a929 100644 --- a/build.gradle +++ b/build.gradle @@ -15,6 +15,7 @@ */ import static androidx.build.SupportConfig.* +import static androidx.build.dependencies.DependenciesKt.* buildscript { dependencies { @@ -56,14 +57,14 @@ dependencies { implementation('com.google.protobuf:protobuf-javalite:3.10.0') - androidTestImplementation(libs.testCore) - androidTestImplementation(libs.testRules) - androidTestImplementation(libs.truth) + androidTestImplementation(ANDROIDX_TEST_CORE) + androidTestImplementation(ANDROIDX_TEST_RULES) + androidTestImplementation(TRUTH) } protobuf { protoc { - artifact = libs.protobufCompiler.get() + artifact = 'com.google.protobuf:protoc:3.10.0' } generateProtoTasks { @@ -92,7 +93,7 @@ android.libraryVariants.all { variant -> // only renames the java classes. Remove them here since they are unused. // Expand the jar and remove any .proto files. 
from(zipTree(configurations.detachedConfiguration( - dependencies.create(libs.protobufLite.get())).getSingleFile())) { + dependencies.create(PROTOBUF_LITE)).getSingleFile())) { exclude("**/*.proto") } diff --git a/icing/file/file-backed-bitmap.cc b/icing/file/file-backed-bitmap.cc index eec7668..f1e568c 100644 --- a/icing/file/file-backed-bitmap.cc +++ b/icing/file/file-backed-bitmap.cc @@ -50,7 +50,7 @@ FileBackedBitmap::Create(const Filesystem* filesystem, auto bitmap = std::unique_ptr<FileBackedBitmap>( new FileBackedBitmap(filesystem, file_path, mmap_strategy)); - // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. libtextclassifier3::Status status = bitmap->Initialize(); if (!status.ok()) { @@ -122,7 +122,7 @@ libtextclassifier3::Status FileBackedBitmap::FileBackedBitmap::Initialize() { << " of size: " << file_size; } - // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. libtextclassifier3::Status status = mmapper_->Remap(0, file_size); if (!status.ok()) { @@ -198,7 +198,7 @@ int FileBackedBitmap::NumBits() const { libtextclassifier3::Status FileBackedBitmap::Set(int bit_index, bool bit_value) { if (bit_index >= NumBits()) { - // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. libtextclassifier3::Status status = GrowTo(bit_index); if (!status.ok()) { @@ -261,7 +261,7 @@ libtextclassifier3::Status FileBackedBitmap::GrowTo(int new_num_bits) { file_path_.c_str(), new_file_size)); } - // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. libtextclassifier3::Status status = mmapper_->Remap(0, new_file_size); if (!status.ok()) { @@ -281,7 +281,7 @@ libtextclassifier3::Status FileBackedBitmap::TruncateTo(int new_num_bits) { } const size_t new_file_size = FileSizeForBits(new_num_bits); - // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. libtextclassifier3::Status status = mmapper_->Remap(0, new_file_size); if (!status.ok()) { diff --git a/icing/file/file-backed-proto-log.h b/icing/file/file-backed-proto-log.h index 686b4fb..b2b37e8 100644 --- a/icing/file/file-backed-proto-log.h +++ b/icing/file/file-backed-proto-log.h @@ -14,14 +14,16 @@ // File-backed log of protos with append-only writes and position based reads. // -// The implementation in this file is deprecated and replaced by -// portable-file-backed-proto-log.h. +// There should only be one instance of a FileBackedProtoLog of the same file at +// a time; using multiple instances at the same time may lead to undefined +// behavior. // -// This deprecated implementation has been made read-only for the purposes of -// migration; writing and erasing this format of log is no longer supported and -// the methods to accomplish this have been removed. +// The entire checksum is computed on initialization to verify the contents are +// valid. On failure, the log will be truncated to the last verified state when +// PersistToDisk() was called. 
If the log cannot successfully restore the last +// state due to disk corruption or some other inconsistency, then the entire log +// will be lost. // -// The details of this format follow below: // Each proto written to the file will have a metadata written just before it. // The metadata consists of // { @@ -29,24 +31,45 @@ // 3 bytes of the proto size // n bytes of the proto itself // } +// +// Example usage: +// ICING_ASSERT_OK_AND_ASSIGN(auto create_result, +// FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path_, +// options)); +// auto proto_log = create_result.proto_log; +// +// Document document; +// document.set_namespace("com.google.android.example"); +// document.set_uri("www.google.com"); +// +// int64_t document_offset = proto_log->WriteProto(document)); +// Document same_document = proto_log->ReadProto(document_offset)); +// proto_log->PersistToDisk(); +// // TODO(b/136514769): Add versioning to the header and a UpgradeToVersion // migration method. + #ifndef ICING_FILE_FILE_BACKED_PROTO_LOG_H_ #define ICING_FILE_FILE_BACKED_PROTO_LOG_H_ +#include <cstddef> #include <cstdint> +#include <cstring> #include <memory> #include <string> #include <string_view> +#include <utility> +#include <vector> +#include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" +#include <google/protobuf/io/gzip_stream.h> #include <google/protobuf/io/zero_copy_stream_impl_lite.h> #include "icing/absl_ports/canonical_errors.h" #include "icing/absl_ports/str_cat.h" #include "icing/file/filesystem.h" #include "icing/file/memory-mapped-file.h" #include "icing/legacy/core/icing-string-util.h" -#include "icing/portable/gzip_stream.h" #include "icing/portable/platform.h" #include "icing/portable/zlib.h" #include "icing/util/crc32.h" @@ -89,6 +112,10 @@ class FileBackedProtoLog { // Header stored at the beginning of the file before the rest of the log // contents. Stores metadata on the log. + // + // TODO(b/139375388): Migrate the Header struct to a proto. This makes + // migrations easier since we don't need to worry about different size padding + // (which would affect the checksum) and different endians. struct Header { static constexpr int32_t kMagic = 0xf4c6f67a; @@ -168,6 +195,20 @@ class FileBackedProtoLog { FileBackedProtoLog(const FileBackedProtoLog&) = delete; FileBackedProtoLog& operator=(const FileBackedProtoLog&) = delete; + // This will update the checksum of the log as well. + ~FileBackedProtoLog(); + + // Writes the serialized proto to the underlying file. Writes are applied + // directly to the underlying file. Users do not need to sync the file after + // writing. + // + // Returns: + // Offset of the newly appended proto in file on success + // INVALID_ARGUMENT if proto is too large, as decided by + // Options.max_proto_size + // INTERNAL_ERROR on IO error + libtextclassifier3::StatusOr<int64_t> WriteProto(const ProtoT& proto); + // Reads out a proto located at file_offset from the file. // // Returns: @@ -177,6 +218,31 @@ class FileBackedProtoLog { // INTERNAL_ERROR on IO error libtextclassifier3::StatusOr<ProtoT> ReadProto(int64_t file_offset) const; + // Erases the data of a proto located at file_offset from the file. + // + // Returns: + // OK on success + // OUT_OF_RANGE_ERROR if file_offset exceeds file size + // INTERNAL_ERROR on IO error + libtextclassifier3::Status EraseProto(int64_t file_offset); + + // Calculates and returns the disk usage in bytes. Rounds up to the nearest + // block size. 
+ // + // Returns: + // Disk usage on success + // INTERNAL_ERROR on IO error + libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const; + + // Returns the file size of all the elements held in the log. File size is in + // bytes. This excludes the size of any internal metadata of the log, e.g. the + // log's header. + // + // Returns: + // File size on success + // INTERNAL_ERROR on IO error + libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const; + // An iterator helping to find offsets of all the protos in file. // Example usage: // @@ -215,6 +281,72 @@ class FileBackedProtoLog { // behaviors could happen. Iterator GetIterator(); + // Persists all changes since initialization or the last call to + // PersistToDisk(). Any changes that aren't persisted may be lost if the + // system fails to close safely. + // + // Example use case: + // + // Document document; + // document.set_namespace("com.google.android.example"); + // document.set_uri("www.google.com"); + // + // { + // ICING_ASSERT_OK_AND_ASSIGN(auto create_result, + // FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path, + // options)); + // auto proto_log = std::move(create_result.proto_log); + // + // int64_t document_offset = proto_log->WriteProto(document)); + // + // // We lose the document here since it wasn't persisted. + // // *SYSTEM CRASH* + // } + // + // { + // // Can still successfully create after a crash since the log can + // // rewind/truncate to recover into a previously good state + // ICING_ASSERT_OK_AND_ASSIGN(auto create_result, + // FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path, + // options)); + // auto proto_log = std::move(create_result.proto_log); + // + // // Lost the proto since we didn't PersistToDisk before the crash + // proto_log->ReadProto(document_offset)); // INVALID_ARGUMENT error + // + // int64_t document_offset = proto_log->WriteProto(document)); + // + // // Persisted this time, so we should be ok. + // ICING_ASSERT_OK(proto_log->PersistToDisk()); + // } + // + // { + // ICING_ASSERT_OK_AND_ASSIGN(auto create_result, + // FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path, + // options)); + // auto proto_log = std::move(create_result.proto_log); + // + // // SUCCESS + // Document same_document = proto_log->ReadProto(document_offset)); + // } + // + // NOTE: Since all protos are already written to the file directly, this + // just updates the checksum and rewind position. Without these updates, + // future initializations will truncate the file and discard unpersisted + // changes. + // + // Returns: + // OK on success + // INTERNAL_ERROR on IO error + libtextclassifier3::Status PersistToDisk(); + + // Calculates the checksum of the log contents. Excludes the header content. + // + // Returns: + // Crc of the log content + // INTERNAL_ERROR on IO error + libtextclassifier3::StatusOr<Crc32> ComputeChecksum(); + private: // Object can only be instantiated via the ::Create factory. FileBackedProtoLog(const Filesystem* filesystem, const std::string& file_path, @@ -292,6 +424,9 @@ class FileBackedProtoLog { static_assert(kMaxProtoSize <= 0x00FFFFFF, "kMaxProtoSize doesn't fit in 3 bytes"); + // Level of compression, BEST_SPEED = 1, BEST_COMPRESSION = 9 + static constexpr int kDeflateCompressionLevel = 3; + // Chunks of the file to mmap at a time, so we don't mmap the entire file. 
// Only used on 32-bit devices static constexpr int kMmapChunkSize = 4 * 1024 * 1024; // 4MiB @@ -303,6 +438,9 @@ class FileBackedProtoLog { }; template <typename ProtoT> +constexpr uint8_t FileBackedProtoLog<ProtoT>::kProtoMagic; + +template <typename ProtoT> FileBackedProtoLog<ProtoT>::FileBackedProtoLog(const Filesystem* filesystem, const std::string& file_path, std::unique_ptr<Header> header) @@ -313,6 +451,15 @@ FileBackedProtoLog<ProtoT>::FileBackedProtoLog(const Filesystem* filesystem, } template <typename ProtoT> +FileBackedProtoLog<ProtoT>::~FileBackedProtoLog() { + if (!PersistToDisk().ok()) { + ICING_LOG(WARNING) + << "Error persisting to disk during destruction of FileBackedProtoLog: " + << file_path_; + } +} + +template <typename ProtoT> libtextclassifier3::StatusOr<typename FileBackedProtoLog<ProtoT>::CreateResult> FileBackedProtoLog<ProtoT>::Create(const Filesystem* filesystem, const std::string& file_path, @@ -541,6 +688,79 @@ libtextclassifier3::StatusOr<Crc32> FileBackedProtoLog<ProtoT>::ComputeChecksum( } template <typename ProtoT> +libtextclassifier3::StatusOr<int64_t> FileBackedProtoLog<ProtoT>::WriteProto( + const ProtoT& proto) { + int64_t proto_size = proto.ByteSizeLong(); + int32_t metadata; + int metadata_size = sizeof(metadata); + int64_t current_position = filesystem_->GetCurrentPosition(fd_.get()); + + if (proto_size > header_->max_proto_size) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "proto_size, %lld, was too large to write. Max is %d", + static_cast<long long>(proto_size), header_->max_proto_size)); + } + + // At this point, we've guaranteed that proto_size is under kMaxProtoSize + // (see + // ::Create), so we can safely store it in an int. + int final_size = 0; + + std::string proto_str; + google::protobuf::io::StringOutputStream proto_stream(&proto_str); + + if (header_->compress) { + google::protobuf::io::GzipOutputStream::Options options; + options.format = google::protobuf::io::GzipOutputStream::ZLIB; + options.compression_level = kDeflateCompressionLevel; + + google::protobuf::io::GzipOutputStream compressing_stream(&proto_stream, + options); + + bool success = proto.SerializeToZeroCopyStream(&compressing_stream) && + compressing_stream.Close(); + + if (!success) { + return absl_ports::InternalError("Error compressing proto."); + } + + final_size = proto_str.size(); + + // In case the compressed proto is larger than the original proto, we also + // can't write it. + if (final_size > header_->max_proto_size) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Compressed proto size, %d, was greater than " + "max_proto_size, %d", + final_size, header_->max_proto_size)); + } + } else { + // Serialize the proto directly into the write buffer at an offset of the + // metadata. + proto.SerializeToZeroCopyStream(&proto_stream); + final_size = proto_str.size(); + } + + // 1st byte for magic, next 3 bytes for proto size. 
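The 4-byte metadata word assembled on the next line is the on-disk framing described at the top of the file: one byte of kProtoMagic followed by three bytes of (possibly compressed) proto size. A standalone sketch of the pack/unpack arithmetic; the magic value below is a stand-in for illustration, since the real constant is defined elsewhere in this header:

```cpp
#include <cassert>
#include <cstdint>

constexpr uint8_t kProtoMagic = 0x5C;          // stand-in value for illustration
constexpr int32_t kMaxProtoSize = 0x00FFFFFF;  // sizes must fit in 3 bytes

int32_t PackMetadata(int32_t proto_size) {
  assert(proto_size >= 0 && proto_size <= kMaxProtoSize);
  return (kProtoMagic << 24) | proto_size;  // [magic][size size size]
}

uint8_t UnpackMagic(int32_t metadata) {
  return static_cast<uint32_t>(metadata) >> 24;
}

int32_t UnpackProtoSize(int32_t metadata) { return metadata & 0x00FFFFFF; }

int main() {
  int32_t metadata = PackMetadata(/*proto_size=*/1234);
  assert(UnpackMagic(metadata) == kProtoMagic);
  assert(UnpackProtoSize(metadata) == 1234);
  return 0;
}
```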
+ metadata = (kProtoMagic << 24) | final_size; + + // Actually write metadata, has to be done after we know the possibly + // compressed proto size + if (!filesystem_->Write(fd_.get(), &metadata, metadata_size)) { + return absl_ports::InternalError( + absl_ports::StrCat("Failed to write proto metadata to: ", file_path_)); + } + + // Write the serialized proto + if (!filesystem_->Write(fd_.get(), proto_str.data(), proto_str.size())) { + return absl_ports::InternalError( + absl_ports::StrCat("Failed to write proto to: ", file_path_)); + } + + return current_position; +} + +template <typename ProtoT> libtextclassifier3::StatusOr<ProtoT> FileBackedProtoLog<ProtoT>::ReadProto( int64_t file_offset) const { int64_t file_size = filesystem_->GetFileSize(fd_.get()); @@ -576,7 +796,7 @@ libtextclassifier3::StatusOr<ProtoT> FileBackedProtoLog<ProtoT>::ReadProto( // Deserialize proto ProtoT proto; if (header_->compress) { - protobuf_ports::GzipInputStream decompress_stream(&proto_stream); + google::protobuf::io::GzipInputStream decompress_stream(&proto_stream); proto.ParseFromZeroCopyStream(&decompress_stream); } else { proto.ParseFromZeroCopyStream(&proto_stream); @@ -586,6 +806,83 @@ libtextclassifier3::StatusOr<ProtoT> FileBackedProtoLog<ProtoT>::ReadProto( } template <typename ProtoT> +libtextclassifier3::Status FileBackedProtoLog<ProtoT>::EraseProto( + int64_t file_offset) { + int64_t file_size = filesystem_->GetFileSize(fd_.get()); + if (file_offset >= file_size) { + // file_size points to the next byte to write at, so subtract one to get + // the inclusive, actual size of file. + return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf( + "Trying to erase data at a location, %lld, " + "out of range of the file size, %lld", + static_cast<long long>(file_offset), + static_cast<long long>(file_size - 1))); + } + + MemoryMappedFile mmapped_file( + *filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC); + + // Read out the metadata + ICING_ASSIGN_OR_RETURN( + int metadata, ReadProtoMetadata(&mmapped_file, file_offset, file_size)); + + ICING_RETURN_IF_ERROR(mmapped_file.Remap(file_offset + sizeof(metadata), + GetProtoSize(metadata))); + + // We need to update the crc checksum if the erased area is before the + // rewind position. + if (file_offset + sizeof(metadata) < header_->rewind_offset) { + // We need to calculate [original string xor 0s]. + // The xored string is the same as the original string because 0 xor 0 = + // 0, 1 xor 0 = 1. 
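The identity the comment leans on is that XOR-ing the old bytes against the all-zero bytes the erase writes reproduces the old bytes, so the checksum patch can be computed from the to-be-erased region alone. A tiny self-check of that claim:

```cpp
#include <cassert>
#include <string>

int main() {
  std::string old_bytes = "serialized proto";
  std::string new_bytes(old_bytes.size(), '\0');  // what the erase writes

  std::string xored;
  for (size_t i = 0; i < old_bytes.size(); ++i) {
    xored.push_back(static_cast<char>(old_bytes[i] ^ new_bytes[i]));
  }
  assert(xored == old_bytes);  // x ^ 0 == x, bit by bit
  return 0;
}
```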
+    const std::string_view xored_str(mmapped_file.region(),
+                                     mmapped_file.region_size());
+
+    Crc32 crc(header_->log_checksum);
+    ICING_ASSIGN_OR_RETURN(
+        uint32_t new_crc,
+        crc.UpdateWithXor(
+            xored_str,
+            /*full_data_size=*/header_->rewind_offset - sizeof(Header),
+            /*position=*/file_offset + sizeof(metadata) - sizeof(Header)));
+
+    header_->log_checksum = new_crc;
+    header_->header_checksum = header_->CalculateHeaderChecksum();
+
+    if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
+                             sizeof(Header))) {
+      return absl_ports::InternalError(
+          absl_ports::StrCat("Failed to update header to: ", file_path_));
+    }
+  }
+
+  memset(mmapped_file.mutable_region(), '\0', mmapped_file.region_size());
+  return libtextclassifier3::Status::OK;
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<int64_t> FileBackedProtoLog<ProtoT>::GetDiskUsage()
+    const {
+  int64_t size = filesystem_->GetDiskUsage(file_path_.c_str());
+  if (size == Filesystem::kBadFileSize) {
+    return absl_ports::InternalError("Failed to get disk usage of proto log");
+  }
+  return size;
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<int64_t>
+FileBackedProtoLog<ProtoT>::GetElementsFileSize() const {
+  int64_t total_file_size = filesystem_->GetFileSize(file_path_.c_str());
+  if (total_file_size == Filesystem::kBadFileSize) {
+    return absl_ports::InternalError(
+        "Failed to get file size of elements in the proto log");
+  }
+  return total_file_size - sizeof(Header);
+}
+
+template <typename ProtoT>
 FileBackedProtoLog<ProtoT>::Iterator::Iterator(const Filesystem& filesystem,
                                                const std::string& file_path,
                                                int64_t initial_offset)
@@ -667,6 +964,51 @@ libtextclassifier3::StatusOr<int> FileBackedProtoLog<ProtoT>::ReadProtoMetadata(
   return metadata;
 }

+template <typename ProtoT>
+libtextclassifier3::Status FileBackedProtoLog<ProtoT>::PersistToDisk() {
+  int64_t file_size = filesystem_->GetFileSize(file_path_.c_str());
+  if (file_size == header_->rewind_offset) {
+    // No new protos appended, don't need to update the checksum.
+    return libtextclassifier3::Status::OK;
+  }
+
+  int64_t new_content_size = file_size - header_->rewind_offset;
+  Crc32 crc;
+  if (new_content_size < 0) {
+    // File shrunk, recalculate the entire checksum.
+    ICING_ASSIGN_OR_RETURN(
+        crc, ComputeChecksum(filesystem_, file_path_, Crc32(), sizeof(Header),
+                             file_size));
+  } else {
+    // Append new changes to the existing checksum.
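This is the property that makes PersistToDisk cheap: a CRC32 can be extended with only the newly appended bytes instead of being recomputed over the whole log. The Crc32 wrapper is icing-internal, but plain zlib shows the same pattern:

```cpp
#include <cassert>
#include <cstring>

#include <zlib.h>

int main() {
  const char all[] = "old content|new content";
  const size_t old_len = 12;  // bytes already covered by the stored checksum

  // Checksum the old region once, then later fold in only the appended bytes.
  uLong incremental = crc32(0L, Z_NULL, 0);
  incremental = crc32(incremental, reinterpret_cast<const Bytef*>(all), old_len);
  incremental = crc32(incremental,
                      reinterpret_cast<const Bytef*>(all + old_len),
                      strlen(all) - old_len);

  // The result matches a one-shot checksum over the full range.
  uLong full = crc32(crc32(0L, Z_NULL, 0),
                     reinterpret_cast<const Bytef*>(all), strlen(all));
  assert(incremental == full);
  return 0;
}
```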
+ ICING_ASSIGN_OR_RETURN( + crc, + ComputeChecksum(filesystem_, file_path_, Crc32(header_->log_checksum), + header_->rewind_offset, file_size)); + } + + header_->log_checksum = crc.Get(); + header_->rewind_offset = file_size; + header_->header_checksum = header_->CalculateHeaderChecksum(); + + if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(), + sizeof(Header)) || + !filesystem_->DataSync(fd_.get())) { + return absl_ports::InternalError( + absl_ports::StrCat("Failed to update header to: ", file_path_)); + } + + return libtextclassifier3::Status::OK; +} + +template <typename ProtoT> +libtextclassifier3::StatusOr<Crc32> +FileBackedProtoLog<ProtoT>::ComputeChecksum() { + return FileBackedProtoLog<ProtoT>::ComputeChecksum( + filesystem_, file_path_, Crc32(), /*start=*/sizeof(Header), + /*end=*/filesystem_->GetFileSize(file_path_.c_str())); +} + } // namespace lib } // namespace icing diff --git a/icing/file/file-backed-proto-log_benchmark.cc b/icing/file/file-backed-proto-log_benchmark.cc new file mode 100644 index 0000000..c09fd5a --- /dev/null +++ b/icing/file/file-backed-proto-log_benchmark.cc @@ -0,0 +1,251 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <cstdint> +#include <random> + +#include "testing/base/public/benchmark.h" +#include "gmock/gmock.h" +#include "icing/document-builder.h" +#include "icing/file/file-backed-proto-log.h" +#include "icing/file/filesystem.h" +#include "icing/legacy/core/icing-string-util.h" +#include "icing/proto/document.pb.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/random-string.h" +#include "icing/testing/tmp-directory.h" + +// go/microbenchmarks +// +// To build and run on a local machine: +// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt +// icing/file:file-backed-proto-log_benchmark +// +// $ blaze-bin/icing/file/file-backed-proto-log_benchmark +// --benchmarks=all +// +// +// To build and run on an Android device (must be connected and rooted): +// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1" +// --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt +// icing/file:file-backed-proto-log_benchmark +// +// $ adb root +// +// $ adb push +// blaze-bin/icing/file/file-backed-proto-log_benchmark +// /data/local/tmp/ +// +// $ adb shell /data/local/tmp/file-backed-proto-log-benchmark +// --benchmarks=all + +namespace icing { +namespace lib { + +namespace { + +static void BM_Write(benchmark::State& state) { + const Filesystem filesystem; + int string_length = state.range(0); + const std::string file_path = IcingStringUtil::StringPrintf( + "%s%s%d%s", GetTestTempDir().c_str(), "/proto_", string_length, ".log"); + int max_proto_size = (1 << 24) - 1; // 16 MiB + bool compress = true; + + // Make sure it doesn't already exist. 
+ filesystem.DeleteFile(file_path.c_str()); + + auto proto_log = + FileBackedProtoLog<DocumentProto>::Create( + &filesystem, file_path, + FileBackedProtoLog<DocumentProto>::Options(compress, max_proto_size)) + .ValueOrDie() + .proto_log; + + DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); + + std::default_random_engine random; + const std::string rand_str = + RandomString(kAlNumAlphabet, string_length, &random); + + auto document_properties = document.add_properties(); + document_properties->set_name("string property"); + document_properties->add_string_values(rand_str); + + for (auto _ : state) { + testing::DoNotOptimize(proto_log->WriteProto(document)); + } + state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * + string_length); + + // Cleanup after ourselves + filesystem.DeleteFile(file_path.c_str()); +} +BENCHMARK(BM_Write) + ->Arg(1) + ->Arg(32) + ->Arg(512) + ->Arg(1024) + ->Arg(4 * 1024) + ->Arg(8 * 1024) + ->Arg(16 * 1024) + ->Arg(32 * 1024) + ->Arg(256 * 1024) + ->Arg(2 * 1024 * 1024) + ->Arg(8 * 1024 * 1024) + ->Arg(15 * 1024 * 1024); // We do 15MiB here since our max proto size is + // 16MiB, and we need some extra space for the + // rest of the document properties + +static void BM_Read(benchmark::State& state) { + const Filesystem filesystem; + int string_length = state.range(0); + const std::string file_path = IcingStringUtil::StringPrintf( + "%s%s%d%s", GetTestTempDir().c_str(), "/proto_", string_length, ".log"); + int max_proto_size = (1 << 24) - 1; // 16 MiB + bool compress = true; + + // Make sure it doesn't already exist. + filesystem.DeleteFile(file_path.c_str()); + + auto proto_log = + FileBackedProtoLog<DocumentProto>::Create( + &filesystem, file_path, + FileBackedProtoLog<DocumentProto>::Options(compress, max_proto_size)) + .ValueOrDie() + .proto_log; + + DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); + + std::default_random_engine random; + const std::string rand_str = + RandomString(kAlNumAlphabet, string_length, &random); + + auto document_properties = document.add_properties(); + document_properties->set_name("string property"); + document_properties->add_string_values(rand_str); + + ICING_ASSERT_OK_AND_ASSIGN(int64_t write_offset, + proto_log->WriteProto(document)); + + for (auto _ : state) { + testing::DoNotOptimize(proto_log->ReadProto(write_offset)); + } + state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * + string_length); + + // Cleanup after ourselves + filesystem.DeleteFile(file_path.c_str()); +} +BENCHMARK(BM_Read) + ->Arg(1) + ->Arg(32) + ->Arg(512) + ->Arg(1024) + ->Arg(4 * 1024) + ->Arg(8 * 1024) + ->Arg(16 * 1024) + ->Arg(32 * 1024) + ->Arg(256 * 1024) + ->Arg(2 * 1024 * 1024) + ->Arg(8 * 1024 * 1024) + ->Arg(15 * 1024 * 1024); // We do 15MiB here since our max proto size is + // 16MiB, and we need some extra space for the + // rest of the document properties + +static void BM_Erase(benchmark::State& state) { + const Filesystem filesystem; + const std::string file_path = IcingStringUtil::StringPrintf( + "%s%s", GetTestTempDir().c_str(), "/proto.log"); + int max_proto_size = (1 << 24) - 1; // 16 MiB + bool compress = true; + + // Make sure it doesn't already exist. 
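These benchmarks construct the log with compress = true, so every WriteProto/ReadProto iteration runs through protobuf's gzip streams, which is also the dependency behind the libprotobuf-cpp-full switch in the Android.bp hunk at the top. A minimal round-trip sketch of that path, assuming the generated DocumentProto from icing/proto/document.pb.h and the ZLIB-format, level-3 settings WriteProto uses:

```cpp
#include <string>

#include <google/protobuf/io/gzip_stream.h>
#include <google/protobuf/io/zero_copy_stream_impl_lite.h>
#include "icing/proto/document.pb.h"  // generated DocumentProto, as used above

// Mirrors the WriteProto compression path: serialize into `out` as a ZLIB
// stream at compression level 3 (kDeflateCompressionLevel).
bool CompressProto(const icing::lib::DocumentProto& proto, std::string* out) {
  google::protobuf::io::StringOutputStream string_stream(out);
  google::protobuf::io::GzipOutputStream::Options options;
  options.format = google::protobuf::io::GzipOutputStream::ZLIB;
  options.compression_level = 3;
  google::protobuf::io::GzipOutputStream gzip_stream(&string_stream, options);
  return proto.SerializeToZeroCopyStream(&gzip_stream) && gzip_stream.Close();
}

// Mirrors the ReadProto decompression path; GzipInputStream auto-detects the
// ZLIB format written above.
bool DecompressProto(const std::string& in, icing::lib::DocumentProto* proto) {
  google::protobuf::io::ArrayInputStream array_stream(
      in.data(), static_cast<int>(in.size()));
  google::protobuf::io::GzipInputStream gzip_stream(&array_stream);
  return proto->ParseFromZeroCopyStream(&gzip_stream);
}
```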
+ filesystem.DeleteFile(file_path.c_str()); + + auto proto_log = + FileBackedProtoLog<DocumentProto>::Create( + &filesystem, file_path, + FileBackedProtoLog<DocumentProto>::Options(compress, max_proto_size)) + .ValueOrDie() + .proto_log; + + DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); + + std::default_random_engine random; + const std::string rand_str = RandomString(kAlNumAlphabet, /*len=*/1, &random); + + auto document_properties = document.add_properties(); + document_properties->set_name("string property"); + document_properties->add_string_values(rand_str); + + for (auto _ : state) { + state.PauseTiming(); + ICING_ASSERT_OK_AND_ASSIGN(int64_t write_offset, + proto_log->WriteProto(document)); + state.ResumeTiming(); + + testing::DoNotOptimize(proto_log->EraseProto(write_offset)); + } + + // Cleanup after ourselves + filesystem.DeleteFile(file_path.c_str()); +} +BENCHMARK(BM_Erase); + +static void BM_ComputeChecksum(benchmark::State& state) { + const Filesystem filesystem; + const std::string file_path = GetTestTempDir() + "/proto.log"; + int max_proto_size = (1 << 24) - 1; // 16 MiB + bool compress = true; + + // Make sure it doesn't already exist. + filesystem.DeleteFile(file_path.c_str()); + + auto proto_log = + FileBackedProtoLog<DocumentProto>::Create( + &filesystem, file_path, + FileBackedProtoLog<DocumentProto>::Options(compress, max_proto_size)) + .ValueOrDie() + .proto_log; + + DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); + + // Make each document 1KiB + int string_length = 1024; + std::default_random_engine random; + const std::string rand_str = + RandomString(kAlNumAlphabet, string_length, &random); + + auto document_properties = document.add_properties(); + document_properties->set_name("string property"); + document_properties->add_string_values(rand_str); + + int num_docs = state.range(0); + for (int i = 0; i < num_docs; ++i) { + ICING_ASSERT_OK(proto_log->WriteProto(document)); + } + + for (auto _ : state) { + testing::DoNotOptimize(proto_log->ComputeChecksum()); + } + + // Cleanup after ourselves + filesystem.DeleteFile(file_path.c_str()); +} +BENCHMARK(BM_ComputeChecksum)->Range(1024, 1 << 20); + +} // namespace +} // namespace lib +} // namespace icing diff --git a/icing/file/file-backed-proto-log_test.cc b/icing/file/file-backed-proto-log_test.cc index eccb0c7..d429277 100644 --- a/icing/file/file-backed-proto-log_test.cc +++ b/icing/file/file-backed-proto-log_test.cc @@ -19,7 +19,10 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" +#include "icing/document-builder.h" #include "icing/file/filesystem.h" +#include "icing/file/mock-filesystem.h" +#include "icing/portable/equals-proto.h" #include "icing/proto/document.pb.h" #include "icing/testing/common-matchers.h" #include "icing/testing/tmp-directory.h" @@ -29,7 +32,14 @@ namespace lib { namespace { +using ::icing::lib::portable_equals_proto::EqualsProto; +using ::testing::A; +using ::testing::Eq; +using ::testing::Gt; +using ::testing::Not; using ::testing::NotNull; +using ::testing::Pair; +using ::testing::Return; class FileBackedProtoLogTest : public ::testing::Test { protected: @@ -77,6 +87,193 @@ TEST_F(FileBackedProtoLogTest, Initialize) { StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } +TEST_F(FileBackedProtoLogTest, WriteProtoTooLarge) { + int max_proto_size = 1; + ICING_ASSERT_OK_AND_ASSIGN( + FileBackedProtoLog<DocumentProto>::CreateResult create_result, + FileBackedProtoLog<DocumentProto>::Create( + &filesystem_, 
file_path_, + FileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); + + // Proto is too large for the max_proto_size_in + ASSERT_THAT(proto_log->WriteProto(document), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +TEST_F(FileBackedProtoLogTest, ReadProtoWrongKProtoMagic) { + ICING_ASSERT_OK_AND_ASSIGN( + FileBackedProtoLog<DocumentProto>::CreateResult create_result, + FileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + FileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Write a proto + DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); + + ICING_ASSERT_OK_AND_ASSIGN(int64_t file_offset, + proto_log->WriteProto(document)); + + // The 4 bytes of metadata that just doesn't have the same kProtoMagic + // specified in file-backed-proto-log.h + uint32_t wrong_magic = 0x7E000000; + + // Sanity check that we opened the file correctly + int fd = filesystem_.OpenForWrite(file_path_.c_str()); + ASSERT_GT(fd, 0); + + // Write the wrong kProtoMagic in, kProtoMagics are stored at the beginning of + // a proto entry. + filesystem_.PWrite(fd, file_offset, &wrong_magic, sizeof(wrong_magic)); + + ASSERT_THAT(proto_log->ReadProto(file_offset), + StatusIs(libtextclassifier3::StatusCode::INTERNAL)); +} + +TEST_F(FileBackedProtoLogTest, ReadWriteUncompressedProto) { + int last_offset; + { + ICING_ASSERT_OK_AND_ASSIGN( + FileBackedProtoLog<DocumentProto>::CreateResult create_result, + FileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + FileBackedProtoLog<DocumentProto>::Options( + /*compress_in=*/false, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Write the first proto + DocumentProto document1 = + DocumentBuilder().SetKey("namespace1", "uri1").Build(); + + ICING_ASSERT_OK_AND_ASSIGN(int written_position, + proto_log->WriteProto(document1)); + + int document1_offset = written_position; + + // Check that what we read is what we wrote + ASSERT_THAT(proto_log->ReadProto(written_position), + IsOkAndHolds(EqualsProto(document1))); + + // Write a second proto that's close to the max size. Leave some room for + // the rest of the proto properties. + std::string long_str(max_proto_size_ - 1024, 'a'); + DocumentProto document2 = DocumentBuilder() + .SetKey("namespace2", "uri2") + .AddStringProperty("long_str", long_str) + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN(written_position, + proto_log->WriteProto(document2)); + + int document2_offset = written_position; + last_offset = written_position; + ASSERT_GT(document2_offset, document1_offset); + + // Check the second proto + ASSERT_THAT(proto_log->ReadProto(written_position), + IsOkAndHolds(EqualsProto(document2))); + + ICING_ASSERT_OK(proto_log->PersistToDisk()); + } + + { + // Make a new proto_log with the same file_path, and make sure we + // can still write to the same underlying file. 
+ ICING_ASSERT_OK_AND_ASSIGN( + FileBackedProtoLog<DocumentProto>::CreateResult create_result, + FileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + FileBackedProtoLog<DocumentProto>::Options( + /*compress_in=*/false, max_proto_size_))); + auto recreated_proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Write a third proto + DocumentProto document3 = + DocumentBuilder().SetKey("namespace3", "uri3").Build(); + + ASSERT_THAT(recreated_proto_log->WriteProto(document3), + IsOkAndHolds(Gt(last_offset))); + } +} + +TEST_F(FileBackedProtoLogTest, ReadWriteCompressedProto) { + int last_offset; + + { + ICING_ASSERT_OK_AND_ASSIGN( + FileBackedProtoLog<DocumentProto>::CreateResult create_result, + FileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + FileBackedProtoLog<DocumentProto>::Options( + /*compress_in=*/true, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Write the first proto + DocumentProto document1 = + DocumentBuilder().SetKey("namespace1", "uri1").Build(); + + ICING_ASSERT_OK_AND_ASSIGN(int written_position, + proto_log->WriteProto(document1)); + + int document1_offset = written_position; + + // Check that what we read is what we wrote + ASSERT_THAT(proto_log->ReadProto(written_position), + IsOkAndHolds(EqualsProto(document1))); + + // Write a second proto that's close to the max size. Leave some room for + // the rest of the proto properties. + std::string long_str(max_proto_size_ - 1024, 'a'); + DocumentProto document2 = DocumentBuilder() + .SetKey("namespace2", "uri2") + .AddStringProperty("long_str", long_str) + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN(written_position, + proto_log->WriteProto(document2)); + + int document2_offset = written_position; + last_offset = written_position; + ASSERT_GT(document2_offset, document1_offset); + + // Check the second proto + ASSERT_THAT(proto_log->ReadProto(written_position), + IsOkAndHolds(EqualsProto(document2))); + + ICING_ASSERT_OK(proto_log->PersistToDisk()); + } + + { + // Make a new proto_log with the same file_path, and make sure we + // can still write to the same underlying file. + ICING_ASSERT_OK_AND_ASSIGN( + FileBackedProtoLog<DocumentProto>::CreateResult create_result, + FileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + FileBackedProtoLog<DocumentProto>::Options( + /*compress_in=*/true, max_proto_size_))); + auto recreated_proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Write a third proto + DocumentProto document3 = + DocumentBuilder().SetKey("namespace3", "uri3").Build(); + + ASSERT_THAT(recreated_proto_log->WriteProto(document3), + IsOkAndHolds(Gt(last_offset))); + } +} + TEST_F(FileBackedProtoLogTest, CorruptHeader) { { ICING_ASSERT_OK_AND_ASSIGN( @@ -106,6 +303,382 @@ TEST_F(FileBackedProtoLogTest, CorruptHeader) { } } +TEST_F(FileBackedProtoLogTest, CorruptContent) { + { + ICING_ASSERT_OK_AND_ASSIGN( + FileBackedProtoLog<DocumentProto>::CreateResult create_result, + FileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + FileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + EXPECT_FALSE(create_result.has_data_loss()); + + DocumentProto document = + DocumentBuilder().SetKey("namespace1", "uri1").Build(); + + // Write and persist an document. 
+ ICING_ASSERT_OK_AND_ASSIGN(int document_offset, + proto_log->WriteProto(document)); + ICING_ASSERT_OK(proto_log->PersistToDisk()); + + // "Corrupt" the content written in the log. + document.set_uri("invalid"); + std::string serialized_document = document.SerializeAsString(); + filesystem_.PWrite(file_path_.c_str(), document_offset, + serialized_document.data(), serialized_document.size()); + } + + { + // We can recover, but we have data loss. + ICING_ASSERT_OK_AND_ASSIGN( + FileBackedProtoLog<DocumentProto>::CreateResult create_result, + FileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + FileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_TRUE(create_result.has_data_loss()); + ASSERT_THAT(create_result.data_loss, Eq(DataLoss::COMPLETE)); + + // Lost everything in the log since the rewind position doesn't help if + // there's been data corruption within the persisted region + ASSERT_EQ(filesystem_.GetFileSize(file_path_.c_str()), + sizeof(FileBackedProtoLog<DocumentProto>::Header)); + } +} + +TEST_F(FileBackedProtoLogTest, PersistToDisk) { + DocumentProto document1 = + DocumentBuilder().SetKey("namespace1", "uri1").Build(); + DocumentProto document2 = + DocumentBuilder().SetKey("namespace2", "uri2").Build(); + int document1_offset, document2_offset; + int log_size; + + { + ICING_ASSERT_OK_AND_ASSIGN( + FileBackedProtoLog<DocumentProto>::CreateResult create_result, + FileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + FileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Write and persist the first proto + ICING_ASSERT_OK_AND_ASSIGN(document1_offset, + proto_log->WriteProto(document1)); + ICING_ASSERT_OK(proto_log->PersistToDisk()); + + // Write, but don't explicitly persist the second proto + ICING_ASSERT_OK_AND_ASSIGN(document2_offset, + proto_log->WriteProto(document2)); + + // Check that what we read is what we wrote + ASSERT_THAT(proto_log->ReadProto(document1_offset), + IsOkAndHolds(EqualsProto(document1))); + ASSERT_THAT(proto_log->ReadProto(document2_offset), + IsOkAndHolds(EqualsProto(document2))); + + log_size = filesystem_.GetFileSize(file_path_.c_str()); + ASSERT_GT(log_size, 0); + } + + { + // The header rewind position and checksum aren't updated in this "system + // crash" scenario. 
+ + std::string bad_proto = + "some incomplete proto that we didn't finish writing before the system " + "crashed"; + filesystem_.PWrite(file_path_.c_str(), log_size, bad_proto.data(), + bad_proto.size()); + + // Double check that we actually wrote something to the underlying file + ASSERT_GT(filesystem_.GetFileSize(file_path_.c_str()), log_size); + } + + { + // We can recover, but we have data loss + ICING_ASSERT_OK_AND_ASSIGN( + FileBackedProtoLog<DocumentProto>::CreateResult create_result, + FileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + FileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_TRUE(create_result.has_data_loss()); + ASSERT_THAT(create_result.data_loss, Eq(DataLoss::PARTIAL)); + + // Check that everything was persisted across instances + ASSERT_THAT(proto_log->ReadProto(document1_offset), + IsOkAndHolds(EqualsProto(document1))); + ASSERT_THAT(proto_log->ReadProto(document2_offset), + IsOkAndHolds(EqualsProto(document2))); + + // We correctly rewound to the last good state. + ASSERT_EQ(log_size, filesystem_.GetFileSize(file_path_.c_str())); + } +} + +TEST_F(FileBackedProtoLogTest, Iterator) { + DocumentProto document1 = + DocumentBuilder().SetKey("namespace", "uri1").Build(); + DocumentProto document2 = + DocumentBuilder().SetKey("namespace", "uri2").Build(); + + ICING_ASSERT_OK_AND_ASSIGN( + FileBackedProtoLog<DocumentProto>::CreateResult create_result, + FileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + FileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + { + // Empty iterator + auto iterator = proto_log->GetIterator(); + ASSERT_THAT(iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + } + + { + // Iterates through some documents + ICING_ASSERT_OK(proto_log->WriteProto(document1)); + ICING_ASSERT_OK(proto_log->WriteProto(document2)); + auto iterator = proto_log->GetIterator(); + // 1st proto + ICING_ASSERT_OK(iterator.Advance()); + ASSERT_THAT(proto_log->ReadProto(iterator.GetOffset()), + IsOkAndHolds(EqualsProto(document1))); + // 2nd proto + ICING_ASSERT_OK(iterator.Advance()); + ASSERT_THAT(proto_log->ReadProto(iterator.GetOffset()), + IsOkAndHolds(EqualsProto(document2))); + // Tries to advance + ASSERT_THAT(iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + } + + { + // Iterator with bad filesystem + MockFilesystem mock_filesystem; + ON_CALL(mock_filesystem, GetFileSize(A<const char *>())) + .WillByDefault(Return(Filesystem::kBadFileSize)); + FileBackedProtoLog<DocumentProto>::Iterator bad_iterator( + mock_filesystem, file_path_, /*initial_offset=*/0); + ASSERT_THAT(bad_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + } +} + +TEST_F(FileBackedProtoLogTest, ComputeChecksum) { + DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); + Crc32 checksum; + + { + ICING_ASSERT_OK_AND_ASSIGN( + FileBackedProtoLog<DocumentProto>::CreateResult create_result, + FileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + FileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + ICING_EXPECT_OK(proto_log->WriteProto(document)); + + ICING_ASSERT_OK_AND_ASSIGN(checksum, 
proto_log->ComputeChecksum()); + + // Calling it twice with no changes should get us the same checksum + EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum))); + } + + { + ICING_ASSERT_OK_AND_ASSIGN( + FileBackedProtoLog<DocumentProto>::CreateResult create_result, + FileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + FileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Checksum should be consistent across instances + EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum))); + + // PersistToDisk shouldn't affect the checksum value + ICING_EXPECT_OK(proto_log->PersistToDisk()); + EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum))); + + // Check that modifying the log leads to a different checksum + ICING_EXPECT_OK(proto_log->WriteProto(document)); + EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Not(Eq(checksum)))); + } +} + +TEST_F(FileBackedProtoLogTest, EraseProtoShouldSetZero) { + DocumentProto document1 = + DocumentBuilder().SetKey("namespace", "uri1").Build(); + + ICING_ASSERT_OK_AND_ASSIGN( + FileBackedProtoLog<DocumentProto>::CreateResult create_result, + FileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + FileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Writes and erases proto + ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset, + proto_log->WriteProto(document1)); + ICING_ASSERT_OK(proto_log->EraseProto(document1_offset)); + + // Checks if the erased area is set to 0. + int64_t file_size = filesystem_.GetFileSize(file_path_.c_str()); + MemoryMappedFile mmapped_file(filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_ONLY); + + // document1_offset + sizeof(int) is the start byte of the proto where + // sizeof(int) is the size of the proto metadata. + mmapped_file.Remap(document1_offset + sizeof(int), file_size - 1); + for (size_t i = 0; i < mmapped_file.region_size(); ++i) { + ASSERT_THAT(mmapped_file.region()[i], Eq(0)); + } +} + +TEST_F(FileBackedProtoLogTest, EraseProtoShouldReturnNotFound) { + DocumentProto document1 = + DocumentBuilder().SetKey("namespace", "uri1").Build(); + DocumentProto document2 = + DocumentBuilder().SetKey("namespace", "uri2").Build(); + + ICING_ASSERT_OK_AND_ASSIGN( + FileBackedProtoLog<DocumentProto>::CreateResult create_result, + FileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + FileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Writes 2 protos + ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset, + proto_log->WriteProto(document1)); + ICING_ASSERT_OK_AND_ASSIGN(int64_t document2_offset, + proto_log->WriteProto(document2)); + + // Erases the first proto + ICING_ASSERT_OK(proto_log->EraseProto(document1_offset)); + + // The first proto has been erased. + ASSERT_THAT(proto_log->ReadProto(document1_offset), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + // The second proto should be returned. 
+ ASSERT_THAT(proto_log->ReadProto(document2_offset), + IsOkAndHolds(EqualsProto(document2))); +} + +TEST_F(FileBackedProtoLogTest, ChecksumShouldBeCorrectWithErasedProto) { + DocumentProto document1 = + DocumentBuilder().SetKey("namespace", "uri1").Build(); + DocumentProto document2 = + DocumentBuilder().SetKey("namespace", "uri2").Build(); + DocumentProto document3 = + DocumentBuilder().SetKey("namespace", "uri3").Build(); + DocumentProto document4 = + DocumentBuilder().SetKey("namespace", "uri4").Build(); + + int64_t document2_offset; + int64_t document3_offset; + + { + // Erase data after the rewind position. This won't update the checksum + // immediately. + ICING_ASSERT_OK_AND_ASSIGN( + FileBackedProtoLog<DocumentProto>::CreateResult create_result, + FileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + FileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Writes 3 protos + ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset, + proto_log->WriteProto(document1)); + ICING_ASSERT_OK_AND_ASSIGN(document2_offset, + proto_log->WriteProto(document2)); + ICING_ASSERT_OK_AND_ASSIGN(document3_offset, + proto_log->WriteProto(document3)); + + // Erases the 1st proto, checksum won't be updated immediately because the + // rewind position is 0. + ICING_ASSERT_OK(proto_log->EraseProto(document1_offset)); + + EXPECT_THAT(proto_log->ComputeChecksum(), + IsOkAndHolds(Eq(Crc32(2293202502)))); + } // New checksum is updated in destructor. + + { + // Erase data before the rewind position. This will update the checksum + // immediately. + ICING_ASSERT_OK_AND_ASSIGN( + FileBackedProtoLog<DocumentProto>::CreateResult create_result, + FileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + FileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Erases the 2nd proto that is now before the rewind position. Checksum is + // updated. + ICING_ASSERT_OK(proto_log->EraseProto(document2_offset)); + + EXPECT_THAT(proto_log->ComputeChecksum(), + IsOkAndHolds(Eq(Crc32(639634028)))); + } + + { + // Append data and erase data before the rewind position. This will update + // the checksum twice: in EraseProto() and destructor. + ICING_ASSERT_OK_AND_ASSIGN( + FileBackedProtoLog<DocumentProto>::CreateResult create_result, + FileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + FileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Append a new document which is after the rewind position. + ICING_ASSERT_OK(proto_log->WriteProto(document4)); + + // Erases the 3rd proto that is now before the rewind position. Checksum is + // updated. + ICING_ASSERT_OK(proto_log->EraseProto(document3_offset)); + + EXPECT_THAT(proto_log->ComputeChecksum(), + IsOkAndHolds(Eq(Crc32(1990198693)))); + } // Checksum is updated with the newly appended document. + + { + // A successful creation means that the checksum matches. 
+ ICING_ASSERT_OK_AND_ASSIGN( + FileBackedProtoLog<DocumentProto>::CreateResult create_result, + FileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + FileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + EXPECT_FALSE(create_result.has_data_loss()); + } +} + } // namespace } // namespace lib } // namespace icing diff --git a/icing/file/file-backed-proto.h b/icing/file/file-backed-proto.h index d7d9bad..15a1953 100644 --- a/icing/file/file-backed-proto.h +++ b/icing/file/file-backed-proto.h @@ -63,17 +63,6 @@ class FileBackedProto { // file_path : Must be a path within in a directory that already exists. FileBackedProto(const Filesystem& filesystem, std::string_view file_path); - // Reset the internal file_path for the file backed proto. - // Example use: - // auto file_backed_proto1 = *FileBackedProto<Proto>::Create(...); - // auto file_backed_proto2 = *FileBackedProto<Proto>::Create(...); - // filesystem.SwapFiles(file1, file2); - // file_backed_proto1.SetSwappedFilepath(file2); - // file_backed_proto2.SetSwappedFilepath(file1); - void SetSwappedFilepath(std::string_view swapped_to_file_path) { - file_path_ = swapped_to_file_path; - } - // Returns a reference to the proto read from the file. It // internally caches the read proto so that future calls are fast. // @@ -110,7 +99,7 @@ class FileBackedProto { mutable absl_ports::shared_mutex mutex_; const Filesystem* const filesystem_; - std::string file_path_; + const std::string file_path_; mutable std::unique_ptr<ProtoT> cached_proto_ ICING_GUARDED_BY(mutex_); }; diff --git a/icing/file/file-backed-vector.h b/icing/file/file-backed-vector.h index 7e42e32..0989935 100644 --- a/icing/file/file-backed-vector.h +++ b/icing/file/file-backed-vector.h @@ -56,9 +56,10 @@ #ifndef ICING_FILE_FILE_BACKED_VECTOR_H_ #define ICING_FILE_FILE_BACKED_VECTOR_H_ +#include <inttypes.h> +#include <stdint.h> #include <sys/mman.h> -#include <cinttypes> #include <cstdint> #include <memory> #include <string> @@ -586,11 +587,8 @@ libtextclassifier3::Status FileBackedVector<T>::GrowIfNecessary( } int64_t current_file_size = filesystem_->GetFileSize(file_path_.c_str()); - if (current_file_size == Filesystem::kBadFileSize) { - return absl_ports::InternalError("Unable to retrieve file size."); - } - int64_t least_file_size_needed = sizeof(Header) + num_elements * sizeof(T); + if (least_file_size_needed <= current_file_size) { // Our underlying file can hold the target num_elements cause we've grown // before diff --git a/icing/file/file-backed-vector_test.cc b/icing/file/file-backed-vector_test.cc index ed94fa5..b05ce2d 100644 --- a/icing/file/file-backed-vector_test.cc +++ b/icing/file/file-backed-vector_test.cc @@ -14,30 +14,26 @@ #include "icing/file/file-backed-vector.h" -#include <unistd.h> +#include <errno.h> #include <algorithm> -#include <cerrno> #include <cstdint> #include <memory> #include <string_view> #include <vector> -#include "knowledge/cerebra/sense/text_classifier/lib3/utils/base/status.h" -#include "testing/base/public/gmock.h" -#include "testing/base/public/gunit.h" -#include "third_party/icing/file/filesystem.h" -#include "third_party/icing/file/memory-mapped-file.h" -#include "third_party/icing/file/mock-filesystem.h" -#include "third_party/icing/testing/common-matchers.h" -#include "third_party/icing/testing/tmp-directory.h" -#include "third_party/icing/util/crc32.h" -#include "third_party/icing/util/logging.h" +#include "gmock/gmock.h" +#include 
"gtest/gtest.h" +#include "icing/file/filesystem.h" +#include "icing/file/memory-mapped-file.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/tmp-directory.h" +#include "icing/util/crc32.h" +#include "icing/util/logging.h" using ::testing::Eq; using ::testing::IsTrue; using ::testing::Pointee; -using ::testing::Return; namespace icing { namespace lib { @@ -78,8 +74,6 @@ class FileBackedVectorTest : public testing::Test { return std::string_view(vector->array() + idx, expected_len); } - const Filesystem& filesystem() const { return filesystem_; } - Filesystem filesystem_; std::string file_path_; int fd_; @@ -644,60 +638,6 @@ TEST_F(FileBackedVectorTest, InitNormalSucceeds) { } } -TEST_F(FileBackedVectorTest, RemapFailureStillValidInstance) { - auto mock_filesystem = std::make_unique<MockFilesystem>(); - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<FileBackedVector<int>> vector, - FileBackedVector<int>::Create( - *mock_filesystem, file_path_, - MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); - - // 1. Write data to just before the first block resize. Running the test - // locally has determined that we'll first resize at 65531st entry. - constexpr int kResizingIndex = 16378; - for (int i = 0; i < kResizingIndex; ++i) { - ICING_ASSERT_OK(vector->Set(i, 7)); - } - - // 2. The next Set call should cause a resize and a remap. Make that remap - // fail. - int num_calls = 0; - auto open_lambda = [this, &num_calls](const char* file_name){ - if (++num_calls == 2) { - return -1; - } - return this->filesystem().OpenForWrite(file_name); - }; - ON_CALL(*mock_filesystem, OpenForWrite(_)).WillByDefault(open_lambda); - EXPECT_THAT(vector->Set(kResizingIndex, 7), - StatusIs(libtextclassifier3::StatusCode::INTERNAL)); - - // 3. We should still be able to call set correctly for earlier regions. - ICING_EXPECT_OK(vector->Set(kResizingIndex / 2, 9)); - EXPECT_THAT(vector->Get(kResizingIndex / 2), IsOkAndHolds(Pointee(Eq(9)))); -} - -TEST_F(FileBackedVectorTest, BadFileSizeDuringGrowReturnsError) { - auto mock_filesystem = std::make_unique<MockFilesystem>(); - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<FileBackedVector<int>> vector, - FileBackedVector<int>::Create( - *mock_filesystem, file_path_, - MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); - - // At first, the vector is empty and has no mapping established. The first Set - // call will cause a Grow. - // During Grow, we will attempt to check the underlying file size to see if - // growing is actually necessary. Return an error on the call to GetFileSize. - ON_CALL(*mock_filesystem, GetFileSize(A<const char*>())) - .WillByDefault(Return(Filesystem::kBadFileSize)); - - // We should fail gracefully and return an INTERNAL error to indicate that - // there was an issue retrieving the file size. 
- EXPECT_THAT(vector->Set(0, 7), - StatusIs(libtextclassifier3::StatusCode::INTERNAL)); -} - } // namespace } // namespace lib diff --git a/icing/file/filesystem.cc b/icing/file/filesystem.cc index 82b8d98..0655cb9 100644 --- a/icing/file/filesystem.cc +++ b/icing/file/filesystem.cc @@ -16,6 +16,7 @@ #include <dirent.h> #include <dlfcn.h> +#include <errno.h> #include <fcntl.h> #include <fnmatch.h> #include <pthread.h> @@ -25,7 +26,6 @@ #include <unistd.h> #include <algorithm> -#include <cerrno> #include <cstdint> #include <unordered_set> diff --git a/icing/file/filesystem.h b/icing/file/filesystem.h index dd2c5d1..6bed8e6 100644 --- a/icing/file/filesystem.h +++ b/icing/file/filesystem.h @@ -17,9 +17,11 @@ #ifndef ICING_FILE_FILESYSTEM_H_ #define ICING_FILE_FILESYSTEM_H_ +#include <stdint.h> +#include <stdio.h> +#include <string.h> + #include <cstdint> -#include <cstdio> -#include <cstring> #include <memory> #include <string> #include <unordered_set> @@ -233,11 +235,6 @@ class Filesystem { // Increments to_increment by size if size is valid, or sets to_increment // to kBadFileSize if either size or to_increment is kBadFileSize. static void IncrementByOrSetInvalid(int64_t size, int64_t* to_increment); - - // Return -1 if file_size is invalid. Otherwise, return file_size. - static int64_t SanitizeFileSize(int64_t file_size) { - return (file_size != kBadFileSize) ? file_size : -1; - } }; // LINT.ThenChange(//depot/google3/icing/file/mock-filesystem.h) diff --git a/icing/file/memory-mapped-file.cc b/icing/file/memory-mapped-file.cc index 9ff3adb..bda01f2 100644 --- a/icing/file/memory-mapped-file.cc +++ b/icing/file/memory-mapped-file.cc @@ -70,10 +70,10 @@ void MemoryMappedFile::MemoryMappedFile::Unmap() { libtextclassifier3::Status MemoryMappedFile::Remap(size_t file_offset, size_t mmap_size) { - if (mmap_size == 0) { - // First unmap any previously mmapped region. - Unmap(); + // First unmap any previously mmapped region. + Unmap(); + if (mmap_size == 0) { // Nothing more to do. return libtextclassifier3::Status::OK; } @@ -118,19 +118,15 @@ libtextclassifier3::Status MemoryMappedFile::Remap(size_t file_offset, "Unable to open file meant to be mmapped: ", file_path_)); } - void* mmap_result = mmap(nullptr, adjusted_mmap_size, protection_flags, - mmap_flags, fd.get(), aligned_offset); + mmap_result_ = mmap(nullptr, adjusted_mmap_size, protection_flags, mmap_flags, + fd.get(), aligned_offset); - if (mmap_result == MAP_FAILED) { + if (mmap_result_ == MAP_FAILED) { + mmap_result_ = nullptr; return absl_ports::InternalError(absl_ports::StrCat( "Failed to mmap region due to error: ", strerror(errno))); } - // Now we know that we have successfully created a new mapping. We can free - // the old one and switch to the new one. 
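The removed comment describes the ordering this change drops: map the new region first, so that a failed mmap leaves the previous mapping intact, and only then release the old one. A minimal POSIX sketch of that map-then-swap ordering, with hypothetical parameter names:

    #include <sys/mman.h>
    #include <sys/types.h>
    #include <cerrno>
    #include <cstddef>

    // Illustrative only: remap to a new geometry without losing the current
    // mapping on failure. new_off must be page-aligned, as with any mmap().
    int RemapKeepingOldOnFailure(int fd, void** addr, size_t* len,
                                 size_t new_len, off_t new_off) {
      void* fresh = mmap(nullptr, new_len, PROT_READ, MAP_SHARED, fd, new_off);
      if (fresh == MAP_FAILED) {
        return errno;  // The old mapping at *addr is still intact and usable.
      }
      // Only after the new mapping succeeds is the old one released.
      if (*addr != nullptr) {
        munmap(*addr, *len);
      }
      *addr = fresh;
      *len = new_len;
      return 0;
    }
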
- Unmap(); - - mmap_result_ = mmap_result; file_offset_ = file_offset; region_ = reinterpret_cast<char*>(mmap_result_) + alignment_adjustment; region_size_ = mmap_size; diff --git a/icing/file/portable-file-backed-proto-log.h b/icing/file/portable-file-backed-proto-log.h index 409ab96..825b763 100644 --- a/icing/file/portable-file-backed-proto-log.h +++ b/icing/file/portable-file-backed-proto-log.h @@ -64,6 +64,7 @@ #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" +#include <google/protobuf/io/gzip_stream.h> #include <google/protobuf/io/zero_copy_stream_impl_lite.h> #include "icing/absl_ports/canonical_errors.h" #include "icing/absl_ports/str_cat.h" @@ -71,7 +72,6 @@ #include "icing/file/memory-mapped-file.h" #include "icing/legacy/core/icing-string-util.h" #include "icing/portable/endian.h" -#include "icing/portable/gzip_stream.h" #include "icing/portable/platform.h" #include "icing/portable/zlib.h" #include "icing/util/bit-util.h" @@ -124,8 +124,6 @@ class PortableFileBackedProtoLog { public: static constexpr int32_t kMagic = 0xf4c6f67a; - // We should go directly from 0 to 2 the next time we have to change the - // format. static constexpr int32_t kFileFormatVersion = 0; uint32_t CalculateHeaderChecksum() const { @@ -143,57 +141,49 @@ class PortableFileBackedProtoLog { return crc.Get(); } - int32_t GetMagic() const { return GNetworkToHostL(magic_nbytes_); } + int32_t GetMagic() const { return gntohl(magic_nbytes_); } - void SetMagic(int32_t magic_in) { - magic_nbytes_ = GHostToNetworkL(magic_in); - } + void SetMagic(int32_t magic_in) { magic_nbytes_ = ghtonl(magic_in); } int32_t GetFileFormatVersion() const { - return GNetworkToHostL(file_format_version_nbytes_); + return gntohl(file_format_version_nbytes_); } void SetFileFormatVersion(int32_t file_format_version_in) { - file_format_version_nbytes_ = GHostToNetworkL(file_format_version_in); + file_format_version_nbytes_ = ghtonl(file_format_version_in); } - int32_t GetMaxProtoSize() const { - return GNetworkToHostL(max_proto_size_nbytes_); - } + int32_t GetMaxProtoSize() const { return gntohl(max_proto_size_nbytes_); } void SetMaxProtoSize(int32_t max_proto_size_in) { - max_proto_size_nbytes_ = GHostToNetworkL(max_proto_size_in); + max_proto_size_nbytes_ = ghtonl(max_proto_size_in); } - int32_t GetLogChecksum() const { - return GNetworkToHostL(log_checksum_nbytes_); - } + int32_t GetLogChecksum() const { return gntohl(log_checksum_nbytes_); } void SetLogChecksum(int32_t log_checksum_in) { - log_checksum_nbytes_ = GHostToNetworkL(log_checksum_in); + log_checksum_nbytes_ = ghtonl(log_checksum_in); } - int64_t GetRewindOffset() const { - return GNetworkToHostLL(rewind_offset_nbytes_); - } + int64_t GetRewindOffset() const { return gntohll(rewind_offset_nbytes_); } void SetRewindOffset(int64_t rewind_offset_in) { - rewind_offset_nbytes_ = GHostToNetworkLL(rewind_offset_in); + rewind_offset_nbytes_ = ghtonll(rewind_offset_in); } int32_t GetHeaderChecksum() const { - return GNetworkToHostL(header_checksum_nbytes_); + return gntohl(header_checksum_nbytes_); } void SetHeaderChecksum(int32_t header_checksum_in) { - header_checksum_nbytes_ = GHostToNetworkL(header_checksum_in); + header_checksum_nbytes_ = ghtonl(header_checksum_in); } bool GetCompressFlag() const { return GetFlag(kCompressBit); } void SetCompressFlag(bool compress) { SetFlag(kCompressBit, compress); } - bool GetDirtyFlag() const { return GetFlag(kDirtyBit); } + bool GetDirtyFlag() { return GetFlag(kDirtyBit); } 
void SetDirtyFlag(bool dirty) { SetFlag(kDirtyBit, dirty); } @@ -219,7 +209,7 @@ class PortableFileBackedProtoLog { // Holds the magic as a quick sanity check against file corruption. // // Field is in network-byte order. - int32_t magic_nbytes_ = GHostToNetworkL(kMagic); + int32_t magic_nbytes_ = ghtonl(kMagic); // Must be at the beginning after kMagic. Contains the crc checksum of // the following fields. @@ -233,7 +223,7 @@ class PortableFileBackedProtoLog { // valid instead of throwing away the entire log. // // Field is in network-byte order. - int64_t rewind_offset_nbytes_ = GHostToNetworkLL(kHeaderReservedBytes); + int64_t rewind_offset_nbytes_ = ghtonll(kHeaderReservedBytes); // Version number tracking how we serialize the file to disk. If we change // how/what we write to disk, this version should be updated and this class @@ -284,7 +274,7 @@ class PortableFileBackedProtoLog { // before updating our checksum. bool recalculated_checksum = false; - bool has_data_loss() const { + bool has_data_loss() { return data_loss == DataLoss::PARTIAL || data_loss == DataLoss::COMPLETE; } }; @@ -378,7 +368,8 @@ class PortableFileBackedProtoLog { // } class Iterator { public: - Iterator(const Filesystem& filesystem, int fd, int64_t initial_offset); + Iterator(const Filesystem& filesystem, const std::string& file_path, + int64_t initial_offset); // Advances to the position of next proto whether it has been erased or not. // @@ -394,12 +385,11 @@ class PortableFileBackedProtoLog { private: static constexpr int64_t kInvalidOffset = -1; // Used to read proto metadata + MemoryMappedFile mmapped_file_; // Offset of first proto - const Filesystem* const filesystem_; int64_t initial_offset_; int64_t current_offset_; int64_t file_size_; - int fd_; }; // Returns an iterator of current proto log. The caller needs to keep the @@ -515,7 +505,7 @@ class PortableFileBackedProtoLog { const Filesystem* filesystem, const std::string& file_path, Crc32 initial_crc, int64_t start, int64_t end); - // Reads out the metadata of a proto located at file_offset from the fd. + // Reads out the metadata of a proto located at file_offset from the file. // Metadata will be returned in host byte order endianness. // // Returns: @@ -523,8 +513,7 @@ class PortableFileBackedProtoLog { // OUT_OF_RANGE_ERROR if file_offset exceeds file_size // INTERNAL_ERROR if the metadata is invalid or any IO errors happen static libtextclassifier3::StatusOr<int32_t> ReadProtoMetadata( - const Filesystem* const filesystem, int fd, int64_t file_offset, - int64_t file_size); + MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size); // Writes metadata of a proto to the fd. Takes in a host byte order endianness // metadata and converts it into a portable metadata before writing. 
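All of the header fields above are stored in network byte order (via icing's ghtonl/gntohl helpers) so a log written on a little-endian device stays readable on a big-endian one. A minimal sketch of the same round trip using the standard socket helpers instead; note that 64-bit fields such as the rewind offset need a ghtonll-style helper, since there is no standard htonll:

    #include <arpa/inet.h>  // htonl() / ntohl()
    #include <cstdint>

    // Sketch: keep the on-disk representation big-endian, convert in accessors.
    struct PortableField {
      uint32_t value_nbytes = 0;  // bytes exactly as they appear on disk
      void Set(uint32_t host_value) { value_nbytes = htonl(host_value); }
      uint32_t Get() const { return ntohl(value_nbytes); }
    };
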
@@ -579,6 +568,9 @@ class PortableFileBackedProtoLog { }; template <typename ProtoT> +constexpr uint8_t PortableFileBackedProtoLog<ProtoT>::kProtoMagic; + +template <typename ProtoT> PortableFileBackedProtoLog<ProtoT>::PortableFileBackedProtoLog( const Filesystem* filesystem, const std::string& file_path, std::unique_ptr<Header> header) @@ -733,7 +725,7 @@ PortableFileBackedProtoLog<ProtoT>::InitializeExistingFile( return absl_ports::InternalError(IcingStringUtil::StringPrintf( "Failed to truncate '%s' to size %lld", file_path.data(), static_cast<long long>(header->GetRewindOffset()))); - } + }; data_loss = DataLoss::PARTIAL; } @@ -889,11 +881,12 @@ PortableFileBackedProtoLog<ProtoT>::WriteProto(const ProtoT& proto) { google::protobuf::io::StringOutputStream proto_stream(&proto_str); if (header_->GetCompressFlag()) { - protobuf_ports::GzipOutputStream::Options options; - options.format = protobuf_ports::GzipOutputStream::ZLIB; + google::protobuf::io::GzipOutputStream::Options options; + options.format = google::protobuf::io::GzipOutputStream::ZLIB; options.compression_level = kDeflateCompressionLevel; - protobuf_ports::GzipOutputStream compressing_stream(&proto_stream, options); + google::protobuf::io::GzipOutputStream compressing_stream(&proto_stream, + options); bool success = proto.SerializeToZeroCopyStream(&compressing_stream) && compressing_stream.Close(); @@ -940,42 +933,40 @@ template <typename ProtoT> libtextclassifier3::StatusOr<ProtoT> PortableFileBackedProtoLog<ProtoT>::ReadProto(int64_t file_offset) const { int64_t file_size = filesystem_->GetFileSize(fd_.get()); - // Read out the metadata - if (file_size == Filesystem::kBadFileSize) { - return absl_ports::OutOfRangeError("Unable to correctly read size."); + MemoryMappedFile mmapped_file(*filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_ONLY); + if (file_offset >= file_size) { + // file_size points to the next byte to write at, so subtract one to get + // the inclusive, actual size of file. + return absl_ports::OutOfRangeError( + IcingStringUtil::StringPrintf("Trying to read from a location, %lld, " + "out of range of the file size, %lld", + static_cast<long long>(file_offset), + static_cast<long long>(file_size - 1))); } + + // Read out the metadata ICING_ASSIGN_OR_RETURN( int32_t metadata, - ReadProtoMetadata(filesystem_, fd_.get(), file_offset, file_size)); + ReadProtoMetadata(&mmapped_file, file_offset, file_size)); // Copy out however many bytes it says the proto is int stored_size = GetProtoSize(metadata); - file_offset += sizeof(metadata); - // Read the compressed proto out. 
- if (file_offset + stored_size > file_size) { - return absl_ports::OutOfRangeError( - IcingStringUtil::StringPrintf("Trying to read from a location, %lld, " - "out of range of the file size, %lld", - static_cast<long long>(file_offset), - static_cast<long long>(file_size - 1))); - } - auto buf = std::make_unique<char[]>(stored_size); - if (!filesystem_->PRead(fd_.get(), buf.get(), stored_size, file_offset)) { - return absl_ports::InternalError(""); - } + ICING_RETURN_IF_ERROR( + mmapped_file.Remap(file_offset + sizeof(metadata), stored_size)); - if (IsEmptyBuffer(buf.get(), stored_size)) { + if (IsEmptyBuffer(mmapped_file.region(), mmapped_file.region_size())) { return absl_ports::NotFoundError("The proto data has been erased."); } - google::protobuf::io::ArrayInputStream proto_stream(buf.get(), - stored_size); + google::protobuf::io::ArrayInputStream proto_stream( + mmapped_file.mutable_region(), stored_size); // Deserialize proto ProtoT proto; if (header_->GetCompressFlag()) { - protobuf_ports::GzipInputStream decompress_stream(&proto_stream); + google::protobuf::io::GzipInputStream decompress_stream(&proto_stream); proto.ParseFromZeroCopyStream(&decompress_stream); } else { proto.ParseFromZeroCopyStream(&proto_stream); @@ -988,29 +979,33 @@ template <typename ProtoT> libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::EraseProto( int64_t file_offset) { int64_t file_size = filesystem_->GetFileSize(fd_.get()); - if (file_size == Filesystem::kBadFileSize) { - return absl_ports::OutOfRangeError("Unable to correctly read size."); + if (file_offset >= file_size) { + // file_size points to the next byte to write at, so subtract one to get + // the inclusive, actual size of file. + return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf( + "Trying to erase data at a location, %lld, " + "out of range of the file size, %lld", + static_cast<long long>(file_offset), + static_cast<long long>(file_size - 1))); } + MemoryMappedFile mmapped_file( + *filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC); + + // Read out the metadata ICING_ASSIGN_OR_RETURN( int32_t metadata, - ReadProtoMetadata(filesystem_, fd_.get(), file_offset, file_size)); - // Copy out however many bytes it says the proto is - int stored_size = GetProtoSize(metadata); - file_offset += sizeof(metadata); - if (file_offset + stored_size > file_size) { - return absl_ports::OutOfRangeError( - IcingStringUtil::StringPrintf("Trying to read from a location, %lld, " - "out of range of the file size, %lld", - static_cast<long long>(file_offset), - static_cast<long long>(file_size - 1))); - } - auto buf = std::make_unique<char[]>(stored_size); + ReadProtoMetadata(&mmapped_file, file_offset, file_size)); + + ICING_RETURN_IF_ERROR(mmapped_file.Remap(file_offset + sizeof(metadata), + GetProtoSize(metadata))); // We need to update the crc checksum if the erased area is before the // rewind position. int32_t new_crc; - if (file_offset < header_->GetRewindOffset()) { + int64_t erased_proto_offset = file_offset + sizeof(metadata); + if (erased_proto_offset < header_->GetRewindOffset()) { // Set to "dirty" before we start writing anything. header_->SetDirtyFlag(true); header_->SetHeaderChecksum(header_->CalculateHeaderChecksum()); @@ -1023,30 +1018,24 @@ libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::EraseProto( // We need to calculate [original string xor 0s]. // The xored string is the same as the original string because 0 xor 0 = // 0, 1 xor 0 = 1. - // Read the compressed proto out. 
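EraseProto patches the cached log checksum rather than rescanning the whole file. The position bookkeeping (UpdateWithXor's full_data_size/position arguments) is icing-specific, but the underlying fact is that CRC-32 is linear over XOR for equal-length inputs: crc(a ^ b) == crc(a) ^ crc(b) ^ crc(zeros). Since zeroing a region XORs it with its own old bytes, the diff to hash is just those bytes. A standalone check of the identity against zlib, which icing already links:

    #include <zlib.h>
    #include <cassert>
    #include <cstddef>
    #include <vector>

    int main() {
      std::vector<unsigned char> a = {1, 2, 3, 4, 5};
      std::vector<unsigned char> b = {9, 8, 7, 6, 5};
      std::vector<unsigned char> x(a.size()), zeros(a.size(), 0);
      for (std::size_t i = 0; i < a.size(); ++i) x[i] = a[i] ^ b[i];
      auto crc = [](const std::vector<unsigned char>& v) {
        return crc32(crc32(0L, Z_NULL, 0), v.data(),
                     static_cast<uInt>(v.size()));
      };
      // Linearity of CRC-32 over XOR, for equal-length buffers.
      assert(crc(x) == (crc(a) ^ crc(b) ^ crc(zeros)));
      return 0;
    }
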
- if (!filesystem_->PRead(fd_.get(), buf.get(), stored_size, file_offset)) { - return absl_ports::InternalError(""); - } - const std::string_view xored_str(buf.get(), stored_size); + const std::string_view xored_str(mmapped_file.region(), + mmapped_file.region_size()); Crc32 crc(header_->GetLogChecksum()); ICING_ASSIGN_OR_RETURN( - new_crc, - crc.UpdateWithXor(xored_str, - /*full_data_size=*/header_->GetRewindOffset() - - kHeaderReservedBytes, - /*position=*/file_offset - kHeaderReservedBytes)); + new_crc, crc.UpdateWithXor( + xored_str, + /*full_data_size=*/header_->GetRewindOffset() - + kHeaderReservedBytes, + /*position=*/erased_proto_offset - kHeaderReservedBytes)); } // Clear the region. - memset(buf.get(), '\0', stored_size); - if (!filesystem_->PWrite(fd_.get(), file_offset, buf.get(), stored_size)) { - return absl_ports::InternalError(""); - } + memset(mmapped_file.mutable_region(), '\0', mmapped_file.region_size()); // If we cleared something in our checksummed area, we should update our // checksum and reset our dirty bit. - if (file_offset < header_->GetRewindOffset()) { + if (erased_proto_offset < header_->GetRewindOffset()) { header_->SetDirtyFlag(false); header_->SetLogChecksum(new_crc); header_->SetHeaderChecksum(header_->CalculateHeaderChecksum()); @@ -1084,12 +1073,13 @@ PortableFileBackedProtoLog<ProtoT>::GetElementsFileSize() const { template <typename ProtoT> PortableFileBackedProtoLog<ProtoT>::Iterator::Iterator( - const Filesystem& filesystem, int fd, int64_t initial_offset) - : filesystem_(&filesystem), + const Filesystem& filesystem, const std::string& file_path, + int64_t initial_offset) + : mmapped_file_(filesystem, file_path, + MemoryMappedFile::Strategy::READ_ONLY), initial_offset_(initial_offset), current_offset_(kInvalidOffset), - fd_(fd) { - file_size_ = filesystem_->GetFileSize(fd_); + file_size_(filesystem.GetFileSize(file_path.c_str())) { if (file_size_ == Filesystem::kBadFileSize) { // Fails all Advance() calls file_size_ = 0; @@ -1106,7 +1096,7 @@ PortableFileBackedProtoLog<ProtoT>::Iterator::Advance() { // Jumps to the next proto position ICING_ASSIGN_OR_RETURN( int32_t metadata, - ReadProtoMetadata(filesystem_, fd_, current_offset_, file_size_)); + ReadProtoMetadata(&mmapped_file_, current_offset_, file_size_)); current_offset_ += sizeof(metadata) + GetProtoSize(metadata); } @@ -1128,15 +1118,14 @@ int64_t PortableFileBackedProtoLog<ProtoT>::Iterator::GetOffset() { template <typename ProtoT> typename PortableFileBackedProtoLog<ProtoT>::Iterator PortableFileBackedProtoLog<ProtoT>::GetIterator() { - return Iterator(*filesystem_, fd_.get(), + return Iterator(*filesystem_, file_path_, /*initial_offset=*/kHeaderReservedBytes); } template <typename ProtoT> libtextclassifier3::StatusOr<int32_t> PortableFileBackedProtoLog<ProtoT>::ReadProtoMetadata( - const Filesystem* const filesystem, int fd, int64_t file_offset, - int64_t file_size) { + MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size) { // Checks file_offset if (file_offset >= file_size) { return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf( @@ -1154,12 +1143,12 @@ PortableFileBackedProtoLog<ProtoT>::ReadProtoMetadata( static_cast<long long>(file_size))); } - if (!filesystem->PRead(fd, &portable_metadata, metadata_size, file_offset)) { - return absl_ports::InternalError(""); - } + // Reads metadata + ICING_RETURN_IF_ERROR(mmapped_file->Remap(file_offset, metadata_size)); + memcpy(&portable_metadata, mmapped_file->region(), metadata_size); // Need to switch it back to host 
order endianness after reading from disk. - int32_t host_order_metadata = GNetworkToHostL(portable_metadata); + int32_t host_order_metadata = gntohl(portable_metadata); // Checks magic number uint8_t stored_k_proto_magic = GetProtoMagic(host_order_metadata); @@ -1177,7 +1166,7 @@ libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::WriteProtoMetadata( const Filesystem* filesystem, int fd, int32_t host_order_metadata) { // Convert it into portable endian format before writing to disk - int32_t portable_metadata = GHostToNetworkL(host_order_metadata); + int32_t portable_metadata = ghtonl(host_order_metadata); int portable_metadata_size = sizeof(portable_metadata); // Write metadata @@ -1197,7 +1186,21 @@ libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::PersistToDisk() { return libtextclassifier3::Status::OK; } - ICING_ASSIGN_OR_RETURN(Crc32 crc, ComputeChecksum()); + int64_t new_content_size = file_size - header_->GetRewindOffset(); + Crc32 crc; + if (new_content_size < 0) { + // File shrunk, recalculate the entire checksum. + ICING_ASSIGN_OR_RETURN( + crc, + ComputeChecksum(filesystem_, file_path_, Crc32(), + /*start=*/kHeaderReservedBytes, /*end=*/file_size)); + } else { + // Append new changes to the existing checksum. + ICING_ASSIGN_OR_RETURN( + crc, ComputeChecksum(filesystem_, file_path_, + Crc32(header_->GetLogChecksum()), + header_->GetRewindOffset(), file_size)); + } header_->SetLogChecksum(crc.Get()); header_->SetRewindOffset(file_size); @@ -1216,26 +1219,9 @@ libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::PersistToDisk() { template <typename ProtoT> libtextclassifier3::StatusOr<Crc32> PortableFileBackedProtoLog<ProtoT>::ComputeChecksum() { - int64_t file_size = filesystem_->GetFileSize(file_path_.c_str()); - int64_t new_content_size = file_size - header_->GetRewindOffset(); - Crc32 crc; - if (new_content_size == 0) { - // No new protos appended, return cached checksum - return Crc32(header_->GetLogChecksum()); - } else if (new_content_size < 0) { - // File shrunk, recalculate the entire checksum. - ICING_ASSIGN_OR_RETURN( - crc, - ComputeChecksum(filesystem_, file_path_, Crc32(), - /*start=*/kHeaderReservedBytes, /*end=*/file_size)); - } else { - // Append new changes to the existing checksum. 
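The "append new changes" branch works because CRC-32 is a running state: seeding crc32() with the cached value and feeding only the bytes written since the last rewind offset yields the same result as hashing the whole range at once. A standalone zlib check, with the offsets hypothetical:

    #include <zlib.h>
    #include <cassert>

    int main() {
      const char text[] = "old-content|new-tail";
      const Bytef* data = reinterpret_cast<const Bytef*>(text);
      const uInt old_len = 12;              // bytes covered at the last persist
      const uInt total = sizeof(text) - 1;  // bytes in the file now

      uLong whole = crc32(crc32(0L, Z_NULL, 0), data, total);
      uLong cached = crc32(crc32(0L, Z_NULL, 0), data, old_len);
      uLong extended = crc32(cached, data + old_len, total - old_len);

      assert(whole == extended);  // extending the cached CRC == one-shot CRC
      return 0;
    }
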
- ICING_ASSIGN_OR_RETURN( - crc, ComputeChecksum( - filesystem_, file_path_, Crc32(header_->GetLogChecksum()), - /*start=*/header_->GetRewindOffset(), /*end=*/file_size)); - } - return crc; + return PortableFileBackedProtoLog<ProtoT>::ComputeChecksum( + filesystem_, file_path_, Crc32(), /*start=*/kHeaderReservedBytes, + /*end=*/filesystem_->GetFileSize(file_path_.c_str())); } } // namespace lib diff --git a/icing/file/portable-file-backed-proto-log_benchmark.cc b/icing/file/portable-file-backed-proto-log_benchmark.cc index 80a8011..04ccab0 100644 --- a/icing/file/portable-file-backed-proto-log_benchmark.cc +++ b/icing/file/portable-file-backed-proto-log_benchmark.cc @@ -55,7 +55,7 @@ namespace lib { namespace { -void BM_Write(benchmark::State& state) { +static void BM_Write(benchmark::State& state) { const Filesystem filesystem; int string_length = state.range(0); const std::string file_path = IcingStringUtil::StringPrintf( @@ -108,7 +108,7 @@ BENCHMARK(BM_Write) // 16MiB, and we need some extra space for the // rest of the document properties -void BM_Read(benchmark::State& state) { +static void BM_Read(benchmark::State& state) { const Filesystem filesystem; int string_length = state.range(0); const std::string file_path = IcingStringUtil::StringPrintf( @@ -164,7 +164,7 @@ BENCHMARK(BM_Read) // 16MiB, and we need some extra space for the // rest of the document properties // -void BM_Erase(benchmark::State& state) { +static void BM_Erase(benchmark::State& state) { const Filesystem filesystem; const std::string file_path = IcingStringUtil::StringPrintf( "%s%s", GetTestTempDir().c_str(), "/proto.log"); @@ -204,7 +204,7 @@ void BM_Erase(benchmark::State& state) { } BENCHMARK(BM_Erase); -void BM_ComputeChecksum(benchmark::State& state) { +static void BM_ComputeChecksum(benchmark::State& state) { const Filesystem filesystem; const std::string file_path = GetTestTempDir() + "/proto.log"; int max_proto_size = (1 << 24) - 1; // 16 MiB @@ -246,98 +246,6 @@ void BM_ComputeChecksum(benchmark::State& state) { } BENCHMARK(BM_ComputeChecksum)->Range(1024, 1 << 20); -void BM_ComputeChecksumWithCachedChecksum(benchmark::State& state) { - const Filesystem filesystem; - const std::string file_path = GetTestTempDir() + "/proto.log"; - int max_proto_size = (1 << 24) - 1; // 16 MiB - bool compress = true; - - // Make sure it doesn't already exist. - filesystem.DeleteFile(file_path.c_str()); - - auto proto_log = PortableFileBackedProtoLog<DocumentProto>::Create( - &filesystem, file_path, - PortableFileBackedProtoLog<DocumentProto>::Options( - compress, max_proto_size)) - .ValueOrDie() - .proto_log; - - DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); - - // Make the document 1KiB - int string_length = 1024; - std::default_random_engine random; - const std::string rand_str = - RandomString(kAlNumAlphabet, string_length, &random); - - auto document_properties = document.add_properties(); - document_properties->set_name("string property"); - document_properties->add_string_values(rand_str); - - // Write some content and persist. This should update our cached checksum to - // include the document. - ICING_ASSERT_OK(proto_log->WriteProto(document)); - ICING_ASSERT_OK(proto_log->PersistToDisk()); - - // This ComputeChecksum call shouldn't need to do any computation since we can - // reuse our cached checksum. 
- for (auto _ : state) { - testing::DoNotOptimize(proto_log->ComputeChecksum()); - } - - // Cleanup after ourselves - filesystem.DeleteFile(file_path.c_str()); -} -BENCHMARK(BM_ComputeChecksumWithCachedChecksum); - -void BM_ComputeChecksumOnlyForTail(benchmark::State& state) { - const Filesystem filesystem; - const std::string file_path = GetTestTempDir() + "/proto.log"; - int max_proto_size = (1 << 24) - 1; // 16 MiB - bool compress = true; - - // Make sure it doesn't already exist. - filesystem.DeleteFile(file_path.c_str()); - - auto proto_log = PortableFileBackedProtoLog<DocumentProto>::Create( - &filesystem, file_path, - PortableFileBackedProtoLog<DocumentProto>::Options( - compress, max_proto_size)) - .ValueOrDie() - .proto_log; - - DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); - - // Make the document 1KiB - int string_length = 1024; - std::default_random_engine random; - const std::string rand_str = - RandomString(kAlNumAlphabet, string_length, &random); - - auto document_properties = document.add_properties(); - document_properties->set_name("string property"); - document_properties->add_string_values(rand_str); - - // Write some content and persist. This should update our cached checksum to - // include the document. - ICING_ASSERT_OK(proto_log->WriteProto(document)); - ICING_ASSERT_OK(proto_log->PersistToDisk()); - - // Write another proto into the tail, but it's not included in our cached - // checksum since we didn't call persist. - ICING_ASSERT_OK(proto_log->WriteProto(document)); - - // ComputeChecksum should be calculating the checksum of the tail and adding - // it to the cached checksum we have. - for (auto _ : state) { - testing::DoNotOptimize(proto_log->ComputeChecksum()); - } - - // Cleanup after ourselves - filesystem.DeleteFile(file_path.c_str()); -} -BENCHMARK(BM_ComputeChecksumOnlyForTail); - } // namespace } // namespace lib } // namespace icing diff --git a/icing/file/portable-file-backed-proto-log_test.cc b/icing/file/portable-file-backed-proto-log_test.cc index 795271a..b5fee4b 100644 --- a/icing/file/portable-file-backed-proto-log_test.cc +++ b/icing/file/portable-file-backed-proto-log_test.cc @@ -851,12 +851,11 @@ TEST_F(PortableFileBackedProtoLogTest, Iterator) { { // Iterator with bad filesystem - ScopedFd sfd(filesystem_.OpenForRead(file_path_.c_str())); MockFilesystem mock_filesystem; - ON_CALL(mock_filesystem, GetFileSize(A<int>())) + ON_CALL(mock_filesystem, GetFileSize(A<const char*>())) .WillByDefault(Return(Filesystem::kBadFileSize)); PortableFileBackedProtoLog<DocumentProto>::Iterator bad_iterator( - mock_filesystem, sfd.get(), /*initial_offset=*/0); + mock_filesystem, file_path_, /*initial_offset=*/0); ASSERT_THAT(bad_iterator.Advance(), StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); } diff --git a/icing/testing/icu-data-file-helper.cc b/icing/helpers/icu/icu-data-file-helper.cc index aaeb738..6607c40 100644 --- a/icing/testing/icu-data-file-helper.cc +++ b/icing/helpers/icu/icu-data-file-helper.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "icing/testing/icu-data-file-helper.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include <sys/mman.h> diff --git a/icing/testing/icu-data-file-helper.h b/icing/helpers/icu/icu-data-file-helper.h index d0276e7..90f5bc7 100644 --- a/icing/testing/icu-data-file-helper.h +++ b/icing/helpers/icu/icu-data-file-helper.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef ICING_TESTING_ICU_DATA_FILE_HELPER -#define ICING_TESTING_ICU_DATA_FILE_HELPER +#ifndef ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER +#define ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER #include "icing/text_classifier/lib3/utils/base/status.h" @@ -40,4 +40,4 @@ libtextclassifier3::Status SetUpICUDataFile( } // namespace lib } // namespace icing -#endif // ICING_TESTING_ICU_DATA_FILE_HELPER +#endif // ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER diff --git a/icing/icing-search-engine-with-icu-file_test.cc b/icing/icing-search-engine-with-icu-file_test.cc index 1012b47..48e81e5 100644 --- a/icing/icing-search-engine-with-icu-file_test.cc +++ b/icing/icing-search-engine-with-icu-file_test.cc @@ -37,13 +37,13 @@ namespace { using ::icing::lib::portable_equals_proto::EqualsProto; using ::testing::Eq; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED = - PropertyConfigProto::Cardinality::REQUIRED; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REQUIRED = + PropertyConfigProto_Cardinality_Code_REQUIRED; -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = - StringIndexingConfig::TokenizerType::PLAIN; +constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = + StringIndexingConfig_TokenizerType_Code_PLAIN; -constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX; +constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX; std::string GetTestBaseDir() { return GetTestTempDir() + "/icing_with_icu_files"; diff --git a/icing/icing-search-engine.cc b/icing/icing-search-engine.cc index 952ba21..20a6bb9 100644 --- a/icing/icing-search-engine.cc +++ b/icing/icing-search-engine.cc @@ -18,7 +18,6 @@ #include <memory> #include <string> #include <string_view> -#include <unordered_map> #include <utility> #include <vector> @@ -36,7 +35,6 @@ #include "icing/index/index.h" #include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/legacy/index/icing-filesystem.h" -#include "icing/portable/endian.h" #include "icing/proto/document.pb.h" #include "icing/proto/initialize.pb.h" #include "icing/proto/internal/optimize.pb.h" @@ -48,7 +46,6 @@ #include "icing/proto/search.pb.h" #include "icing/proto/status.pb.h" #include "icing/query/query-processor.h" -#include "icing/query/suggestion-processor.h" #include "icing/result/projection-tree.h" #include "icing/result/projector.h" #include "icing/result/result-retriever.h" @@ -60,7 +57,6 @@ #include "icing/scoring/scoring-processor.h" #include "icing/store/document-id.h" #include "icing/store/document-store.h" -#include "icing/store/namespace-checker-impl.h" #include "icing/tokenization/language-segmenter-factory.h" #include "icing/tokenization/language-segmenter.h" #include "icing/transform/normalizer-factory.h" @@ -81,30 +77,19 @@ constexpr std::string_view kDocumentSubfolderName = "document_dir"; constexpr std::string_view kIndexSubfolderName = "index_dir"; constexpr std::string_view kSchemaSubfolderName = "schema_dir"; constexpr std::string_view kSetSchemaMarkerFilename = "set_schema_marker"; -constexpr std::string_view 
kInitMarkerFilename = "init_marker"; constexpr std::string_view kOptimizeStatusFilename = "optimize_status"; -// The maximum number of unsuccessful initialization attempts from the current -// state that we will tolerate before deleting all data and starting from a -// fresh state. -constexpr int kMaxUnsuccessfulInitAttempts = 5; - -// A pair that holds namespace and type. -struct NamespaceTypePair { - std::string namespace_; - std::string type; - - bool operator==(const NamespaceTypePair& other) const { - return namespace_ == other.namespace_ && type == other.type; - } -}; - -struct NamespaceTypePairHasher { - std::size_t operator()(const NamespaceTypePair& pair) const { - return std::hash<std::string>()(pair.namespace_) ^ - std::hash<std::string>()(pair.type); +libtextclassifier3::Status ValidateOptions( + const IcingSearchEngineOptions& options) { + // These options are only used in IndexProcessor, which won't be created + // until the first Put call. So they must be checked here, so that any + // errors can be surfaced in Initialize. + if (options.max_tokens_per_doc() <= 0) { + return absl_ports::InvalidArgumentError( + "Options::max_tokens_per_doc must be greater than zero."); } -}; + return libtextclassifier3::Status::OK; +} libtextclassifier3::Status ValidateResultSpec( const ResultSpecProto& result_spec) { @@ -142,29 +127,14 @@ libtextclassifier3::Status ValidateSearchSpec( return libtextclassifier3::Status::OK; } -libtextclassifier3::Status ValidateSuggestionSpec( - const SuggestionSpecProto& suggestion_spec, - const PerformanceConfiguration& configuration) { - if (suggestion_spec.prefix().empty()) { - return absl_ports::InvalidArgumentError( - absl_ports::StrCat("SuggestionSpecProto.prefix is empty!")); - } - if (suggestion_spec.scoring_spec().scoring_match_type() == - TermMatchType::UNKNOWN) { - return absl_ports::InvalidArgumentError( - absl_ports::StrCat("SuggestionSpecProto.term_match_type is unknown!")); - } - if (suggestion_spec.num_to_return() <= 0) { - return absl_ports::InvalidArgumentError(absl_ports::StrCat( - "SuggestionSpecProto.num_to_return must be positive.")); - } - if (suggestion_spec.prefix().size() > configuration.max_query_length) { - return absl_ports::InvalidArgumentError( - absl_ports::StrCat("SuggestionSpecProto.prefix is longer than the " - "maximum allowed prefix length: ", - std::to_string(configuration.max_query_length))); - } - return libtextclassifier3::Status::OK; +IndexProcessor::Options CreateIndexProcessorOptions( + const IcingSearchEngineOptions& options) { + IndexProcessor::Options index_processor_options; + index_processor_options.max_tokens_per_document = + options.max_tokens_per_doc(); + index_processor_options.token_limit_behavior = + IndexProcessor::Options::TokenLimitBehavior::kSuppressError; + return index_processor_options; } // Document store files are in a standalone subfolder for easier file @@ -194,15 +164,10 @@ std::string MakeIndexDirectoryPath(const std::string& base_dir) { std::string MakeSchemaDirectoryPath(const std::string& base_dir) { return absl_ports::StrCat(base_dir, "/", kSchemaSubfolderName); } - std::string MakeSetSchemaMarkerFilePath(const std::string& base_dir) { return absl_ports::StrCat(base_dir, "/", kSetSchemaMarkerFilename); } -std::string MakeInitMarkerFilePath(const std::string& base_dir) { - return absl_ports::StrCat(base_dir, "/", kInitMarkerFilename); -} - void TransformStatus(const libtextclassifier3::Status& internal_status, StatusProto* status_proto) { StatusProto::Code code; @@ -273,28 +238,6 @@ void 
TransformStatus(const libtextclassifier3::Status& internal_status, status_proto->set_message(internal_status.error_message()); } -libtextclassifier3::Status RetrieveAndAddDocumentInfo( - const DocumentStore* document_store, DeleteByQueryResultProto& result_proto, - std::unordered_map<NamespaceTypePair, - DeleteByQueryResultProto::DocumentGroupInfo*, - NamespaceTypePairHasher>& info_map, - DocumentId document_id) { - ICING_ASSIGN_OR_RETURN(DocumentProto document, - document_store->Get(document_id)); - NamespaceTypePair key = {document.namespace_(), document.schema()}; - auto iter = info_map.find(key); - if (iter == info_map.end()) { - auto entry = result_proto.add_deleted_documents(); - entry->set_namespace_(std::move(document.namespace_())); - entry->set_schema(std::move(document.schema())); - entry->add_uris(std::move(document.uri())); - info_map[key] = entry; - } else { - iter->second->add_uris(std::move(document.uri())); - } - return libtextclassifier3::Status::OK; -} - } // namespace IcingSearchEngine::IcingSearchEngine(const IcingSearchEngineOptions& options, @@ -333,66 +276,6 @@ InitializeResultProto IcingSearchEngine::Initialize() { return InternalInitialize(); } -void IcingSearchEngine::ResetMembers() { - schema_store_.reset(); - document_store_.reset(); - language_segmenter_.reset(); - normalizer_.reset(); - index_.reset(); -} - -libtextclassifier3::Status IcingSearchEngine::CheckInitMarkerFile( - InitializeStatsProto* initialize_stats) { - // Check to see if the marker file exists and if we've already passed our max - // number of init attempts. - std::string marker_filepath = MakeInitMarkerFilePath(options_.base_dir()); - bool file_exists = filesystem_->FileExists(marker_filepath.c_str()); - int network_init_attempts = 0; - int host_init_attempts = 0; - - // Read the number of previous failed init attempts from the file. If it - // fails, then just assume the value is zero (the most likely reason for - // failure would be non-existence because the last init was successful - // anyways). - ScopedFd marker_file_fd(filesystem_->OpenForWrite(marker_filepath.c_str())); - libtextclassifier3::Status status; - if (file_exists && - filesystem_->PRead(marker_file_fd.get(), &network_init_attempts, - sizeof(network_init_attempts), /*offset=*/0)) { - host_init_attempts = GNetworkToHostL(network_init_attempts); - if (host_init_attempts > kMaxUnsuccessfulInitAttempts) { - // We're tried and failed to init too many times. We need to throw - // everything out and start from scratch. - ResetMembers(); - if (!filesystem_->DeleteDirectoryRecursively( - options_.base_dir().c_str())) { - return absl_ports::InternalError("Failed to delete icing base dir!"); - } - status = absl_ports::DataLossError( - "Encountered failed initialization limit. Cleared all data."); - host_init_attempts = 0; - } - } - - // Use network_init_attempts here because we might have set host_init_attempts - // to 0 if it exceeded the max threshold. - initialize_stats->set_num_previous_init_failures( - GNetworkToHostL(network_init_attempts)); - - ++host_init_attempts; - network_init_attempts = GHostToNetworkL(host_init_attempts); - // Write the updated number of attempts before we get started. 
- if (!filesystem_->PWrite(marker_file_fd.get(), /*offset=*/0, - &network_init_attempts, - sizeof(network_init_attempts)) || - !filesystem_->DataSync(marker_file_fd.get())) { - return absl_ports::InternalError( - "Failed to write and sync init marker file"); - } - - return status; -} - InitializeResultProto IcingSearchEngine::InternalInitialize() { ICING_VLOG(1) << "Initializing IcingSearchEngine in dir: " << options_.base_dir(); @@ -413,17 +296,9 @@ InitializeResultProto IcingSearchEngine::InternalInitialize() { return result_proto; } - // Now go ahead and try to initialize. libtextclassifier3::Status status = InitializeMembers(initialize_stats); if (status.ok() || absl_ports::IsDataLoss(status)) { - // We successfully initialized. We should delete the init marker file to - // indicate a successful init. - std::string marker_filepath = MakeInitMarkerFilePath(options_.base_dir()); - if (!filesystem_->DeleteFile(marker_filepath.c_str())) { - status = absl_ports::InternalError("Failed to delete init marker file!"); - } else { - initialized_ = true; - } + initialized_ = true; } TransformStatus(status, result_status); initialize_stats->set_latency_ms(initialize_timer->GetElapsedMilliseconds()); @@ -433,20 +308,7 @@ InitializeResultProto IcingSearchEngine::InternalInitialize() { libtextclassifier3::Status IcingSearchEngine::InitializeMembers( InitializeStatsProto* initialize_stats) { ICING_RETURN_ERROR_IF_NULL(initialize_stats); - - // Make sure the base directory exists - if (!filesystem_->CreateDirectoryRecursively(options_.base_dir().c_str())) { - return absl_ports::InternalError(absl_ports::StrCat( - "Could not create directory: ", options_.base_dir())); - } - - // Check to see if the marker file exists and if we've already passed our max - // number of init attempts. - libtextclassifier3::Status status = CheckInitMarkerFile(initialize_stats); - if (!status.ok() && !absl_ports::IsDataLoss(status)) { - return status; - } - + ICING_RETURN_IF_ERROR(InitializeOptions()); ICING_RETURN_IF_ERROR(InitializeSchemaStore(initialize_stats)); // TODO(b/156383798) : Resolve how to specify the locale. @@ -460,7 +322,7 @@ libtextclassifier3::Status IcingSearchEngine::InitializeMembers( std::string marker_filepath = MakeSetSchemaMarkerFilePath(options_.base_dir()); - libtextclassifier3::Status index_init_status; + libtextclassifier3::Status status; if (absl_ports::IsNotFound(schema_store_->GetSchema().status())) { // The schema was either lost or never set before. Wipe out the doc store // and index directories and initialize them from scratch. @@ -474,15 +336,14 @@ libtextclassifier3::Status IcingSearchEngine::InitializeMembers( } ICING_RETURN_IF_ERROR(InitializeDocumentStore( /*force_recovery_and_revalidate_documents=*/false, initialize_stats)); - index_init_status = InitializeIndex(initialize_stats); - if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) { - return index_init_status; - } + status = InitializeIndex(initialize_stats); } else if (filesystem_->FileExists(marker_filepath.c_str())) { // If the marker file is still around then something wonky happened when we // last tried to set the schema. ICING_RETURN_IF_ERROR(InitializeDocumentStore( /*force_recovery_and_revalidate_documents=*/true, initialize_stats)); + initialize_stats->set_document_store_recovery_cause( + InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC); // We're going to need to build the index from scratch. So just delete its // files now. 
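The set-schema marker here, like the init marker removed above, follows one crash-recovery idiom: persist a sentinel before a risky multi-step operation, delete it on success, and treat its presence at startup as proof the operation was interrupted. A POSIX-flavored sketch, with hypothetical helper names and most error handling elided:

    #include <fcntl.h>
    #include <unistd.h>

    // Hypothetical sentinel helpers; real code must check and propagate errors.
    bool MarkerExists(const char* path) { return access(path, F_OK) == 0; }

    bool CreateMarker(const char* path) {
      int fd = open(path, O_CREAT | O_WRONLY, 0600);
      if (fd < 0) return false;
      bool ok = fsync(fd) == 0;  // make the sentinel itself survive a crash
      close(fd);
      return ok;
    }

    // Startup: if MarkerExists(), the previous run died mid-operation, so
    // rebuild derived state (here, the index) before accepting new work.
    // Success path: unlink(path) so the next startup takes the fast path.
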
@@ -499,12 +360,12 @@ libtextclassifier3::Status IcingSearchEngine::InitializeMembers( std::unique_ptr<Timer> restore_timer = clock_->GetNewTimer(); IndexRestorationResult restore_result = RestoreIndexIfNeeded(); - index_init_status = std::move(restore_result.status); + status = std::move(restore_result.status); // DATA_LOSS means that we have successfully initialized and re-added // content to the index. Some indexed content was lost, but otherwise the // index is in a valid state and can be queried. - if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) { - return index_init_status; + if (!status.ok() && !absl_ports::IsDataLoss(status)) { + return status; } // Delete the marker file to indicate that everything is now in sync with @@ -518,22 +379,30 @@ libtextclassifier3::Status IcingSearchEngine::InitializeMembers( } else { ICING_RETURN_IF_ERROR(InitializeDocumentStore( /*force_recovery_and_revalidate_documents=*/false, initialize_stats)); - index_init_status = InitializeIndex(initialize_stats); - if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) { - return index_init_status; + status = InitializeIndex(initialize_stats); + if (!status.ok() && !absl_ports::IsDataLoss(status)) { + return status; } } - if (status.ok()) { - status = index_init_status; - } - result_state_manager_ = std::make_unique<ResultStateManager>( performance_configuration_.max_num_total_hits, *document_store_); return status; } +libtextclassifier3::Status IcingSearchEngine::InitializeOptions() { + ICING_RETURN_IF_ERROR(ValidateOptions(options_)); + + // Make sure the base directory exists + if (!filesystem_->CreateDirectoryRecursively(options_.base_dir().c_str())) { + return absl_ports::InternalError(absl_ports::StrCat( + "Could not create directory: ", options_.base_dir())); + } + + return libtextclassifier3::Status::OK; +} + libtextclassifier3::Status IcingSearchEngine::InitializeSchemaStore( InitializeStatsProto* initialize_stats) { ICING_RETURN_ERROR_IF_NULL(initialize_stats); @@ -633,18 +502,15 @@ SetSchemaResultProto IcingSearchEngine::SetSchema( StatusProto* result_status = result_proto.mutable_status(); absl_ports::unique_lock l(&mutex_); - std::unique_ptr<Timer> timer = clock_->GetNewTimer(); if (!initialized_) { result_status->set_code(StatusProto::FAILED_PRECONDITION); result_status->set_message("IcingSearchEngine has not been initialized!"); - result_proto.set_latency_ms(timer->GetElapsedMilliseconds()); return result_proto; } auto lost_previous_schema_or = LostPreviousSchema(); if (!lost_previous_schema_or.ok()) { TransformStatus(lost_previous_schema_or.status(), result_status); - result_proto.set_latency_ms(timer->GetElapsedMilliseconds()); return result_proto; } bool lost_previous_schema = lost_previous_schema_or.ValueOrDie(); @@ -662,11 +528,10 @@ SetSchemaResultProto IcingSearchEngine::SetSchema( std::move(new_schema), ignore_errors_and_delete_documents); if (!set_schema_result_or.ok()) { TransformStatus(set_schema_result_or.status(), result_status); - result_proto.set_latency_ms(timer->GetElapsedMilliseconds()); return result_proto; } - SchemaStore::SetSchemaResult set_schema_result = - std::move(set_schema_result_or).ValueOrDie(); + const SchemaStore::SetSchemaResult set_schema_result = + set_schema_result_or.ValueOrDie(); for (const std::string& deleted_type : set_schema_result.schema_types_deleted_by_name) { @@ -678,25 +543,6 @@ SetSchemaResultProto IcingSearchEngine::SetSchema( result_proto.add_incompatible_schema_types(incompatible_type); } - for 
(const std::string& new_type : - set_schema_result.schema_types_new_by_name) { - result_proto.add_new_schema_types(std::move(new_type)); - } - - for (const std::string& compatible_type : - set_schema_result.schema_types_changed_fully_compatible_by_name) { - result_proto.add_fully_compatible_changed_schema_types( - std::move(compatible_type)); - } - - bool index_incompatible = - !set_schema_result.schema_types_index_incompatible_by_name.empty(); - for (const std::string& index_incompatible_type : - set_schema_result.schema_types_index_incompatible_by_name) { - result_proto.add_index_incompatible_changed_schema_types( - std::move(index_incompatible_type)); - } - libtextclassifier3::Status status; if (set_schema_result.success) { if (lost_previous_schema) { @@ -705,7 +551,6 @@ SetSchemaResultProto IcingSearchEngine::SetSchema( status = document_store_->UpdateSchemaStore(schema_store_.get()); if (!status.ok()) { TransformStatus(status, result_status); - result_proto.set_latency_ms(timer->GetElapsedMilliseconds()); return result_proto; } } else if (!set_schema_result.old_schema_type_ids_changed.empty() || @@ -715,17 +560,15 @@ SetSchemaResultProto IcingSearchEngine::SetSchema( set_schema_result); if (!status.ok()) { TransformStatus(status, result_status); - result_proto.set_latency_ms(timer->GetElapsedMilliseconds()); return result_proto; } } - if (lost_previous_schema || index_incompatible) { + if (lost_previous_schema || set_schema_result.index_incompatible) { // Clears all index files status = index_->Reset(); if (!status.ok()) { TransformStatus(status, result_status); - result_proto.set_latency_ms(timer->GetElapsedMilliseconds()); return result_proto; } @@ -736,7 +579,6 @@ SetSchemaResultProto IcingSearchEngine::SetSchema( if (!restore_result.status.ok() && !absl_ports::IsDataLoss(restore_result.status)) { TransformStatus(status, result_status); - result_proto.set_latency_ms(timer->GetElapsedMilliseconds()); return result_proto; } } @@ -747,7 +589,6 @@ SetSchemaResultProto IcingSearchEngine::SetSchema( result_status->set_message("Schema is incompatible."); } - result_proto.set_latency_ms(timer->GetElapsedMilliseconds()); return result_proto; } @@ -841,8 +682,9 @@ PutResultProto IcingSearchEngine::Put(DocumentProto&& document) { } DocumentId document_id = document_id_or.ValueOrDie(); - auto index_processor_or = - IndexProcessor::Create(normalizer_.get(), index_.get(), clock_.get()); + auto index_processor_or = IndexProcessor::Create( + normalizer_.get(), index_.get(), CreateIndexProcessorOptions(options_), + clock_.get()); if (!index_processor_or.ok()) { TransformStatus(index_processor_or.status(), result_status); put_document_stats->set_latency_ms(put_timer->GetElapsedMilliseconds()); @@ -853,17 +695,6 @@ PutResultProto IcingSearchEngine::Put(DocumentProto&& document) { auto status = index_processor->IndexDocument(tokenized_document, document_id, put_document_stats); - if (!status.ok()) { - // If we encountered a failure while indexing this document, then mark it as - // deleted. - libtextclassifier3::Status delete_status = - document_store_->Delete(document_id); - if (!delete_status.ok()) { - // This is pretty dire (and, hopefully, unlikely). We can't roll back the - // document that we just added. Wipeout the whole index. 
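The removed block is a two-level fallback: if indexing fails, un-put the document so the store and index stay consistent; if even that delete fails, the ResetInternal() call just below wipes everything rather than serve a corrupt index. A toy self-contained sketch of that escalation ladder, with every type a hypothetical stand-in:

    #include <string>
    #include <unordered_map>
    #include <utility>
    #include <vector>

    // Toy stand-ins for the document store and index; illustrative only.
    struct Store {
      std::vector<std::string> docs;
      int Add(std::string d) {
        docs.push_back(std::move(d));
        return static_cast<int>(docs.size()) - 1;
      }
      bool Delete(int id) { docs[id].clear(); return true; }
    };
    struct Index {
      std::unordered_map<int, std::string> terms;
      // Fails on an empty document, standing in for a real indexing error.
      bool Add(int id, const std::string& d) { terms[id] = d; return !d.empty(); }
    };

    void ResetEverything(Store& s, Index& i) { s.docs.clear(); i.terms.clear(); }

    // The escalation ladder from the removed code: index failure -> delete the
    // just-added document; delete failure -> wipe everything and start fresh.
    bool PutWithRollback(Store& store, Index& index, std::string doc) {
      int id = store.Add(doc);
      if (!index.Add(id, doc)) {
        if (!store.Delete(id)) ResetEverything(store, index);
        return false;
      }
      return true;
    }

    int main() {
      Store store; Index index;
      bool ok = PutWithRollback(store, index, "hello");  // succeeds
      bool bad = PutWithRollback(store, index, "");      // fails, rolled back
      return (ok && !bad) ? 0 : 1;
    }
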
- ResetInternal(); - } - } TransformStatus(status, result_status); put_document_stats->set_latency_ms(put_timer->GetElapsedMilliseconds()); @@ -972,7 +803,7 @@ DeleteResultProto IcingSearchEngine::Delete(const std::string_view name_space, delete_stats->set_delete_type(DeleteStatsProto::DeleteType::SINGLE); std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer(); - // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. libtextclassifier3::Status status = document_store_->Delete(name_space, uri); if (!status.ok()) { @@ -1006,7 +837,7 @@ DeleteByNamespaceResultProto IcingSearchEngine::DeleteByNamespace( delete_stats->set_delete_type(DeleteStatsProto::DeleteType::NAMESPACE); std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer(); - // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. DocumentStore::DeleteByGroupResult doc_store_result = document_store_->DeleteByNamespace(name_space); @@ -1040,7 +871,7 @@ DeleteBySchemaTypeResultProto IcingSearchEngine::DeleteBySchemaType( delete_stats->set_delete_type(DeleteStatsProto::DeleteType::SCHEMA_TYPE); std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer(); - // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. DocumentStore::DeleteByGroupResult doc_store_result = document_store_->DeleteBySchemaType(schema_type); @@ -1058,7 +889,7 @@ DeleteBySchemaTypeResultProto IcingSearchEngine::DeleteBySchemaType( } DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery( - const SearchSpecProto& search_spec, bool return_deleted_document_info) { + const SearchSpecProto& search_spec) { ICING_VLOG(1) << "Deleting documents for query " << search_spec.query() << " from doc store"; @@ -1072,13 +903,9 @@ DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery( return result_proto; } - DeleteByQueryStatsProto* delete_stats = - result_proto.mutable_delete_by_query_stats(); - delete_stats->set_query_length(search_spec.query().length()); - delete_stats->set_num_namespaces_filtered( - search_spec.namespace_filters_size()); - delete_stats->set_num_schema_types_filtered( - search_spec.schema_type_filters_size()); + DeleteStatsProto* delete_stats = result_proto.mutable_delete_stats(); + delete_stats->set_delete_type(DeleteStatsProto::DeleteType::QUERY); + std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer(); libtextclassifier3::Status status = @@ -1088,7 +915,6 @@ DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery( return result_proto; } - std::unique_ptr<Timer> component_timer = clock_->GetNewTimer(); // Gets unordered results from query processor auto query_processor_or = QueryProcessor::Create( index_.get(), language_segmenter_.get(), normalizer_.get(), @@ -1107,32 +933,14 @@ DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery( } QueryProcessor::QueryResults query_results = std::move(query_results_or).ValueOrDie(); - delete_stats->set_parse_query_latency_ms( - component_timer->GetElapsedMilliseconds()); ICING_VLOG(2) << "Deleting the docs that matched the query."; int num_deleted = 0; - // A map used to group deleted documents. - // From the (namespace, type) pair to a list of uris. 
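The removed grouping code keys an unordered_map on a (namespace, type) struct, which needs operator== plus a hasher; the NamespaceTypePairHasher removed earlier XORs the two string hashes, which is cheap but collides whenever namespace == type. A minimal self-contained version of the pattern, using a multiply-mix instead of plain XOR:

    #include <cstddef>
    #include <functional>
    #include <string>
    #include <unordered_map>
    #include <vector>

    struct NsType {
      std::string ns, type;
      bool operator==(const NsType& o) const {
        return ns == o.ns && type == o.type;
      }
    };

    struct NsTypeHash {
      std::size_t operator()(const NsType& k) const {
        std::size_t h = std::hash<std::string>()(k.ns);
        // Mixing instead of plain XOR avoids hash(a, a) collapsing to zero.
        return h * 31 + std::hash<std::string>()(k.type);
      }
    };

    int main() {
      std::unordered_map<NsType, std::vector<std::string>, NsTypeHash> groups;
      groups[{"ns1", "Email"}].push_back("uri/1");
      groups[{"ns1", "Email"}].push_back("uri/2");
      return groups.begin()->second.size() == 2 ? 0 : 1;
    }
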
- std::unordered_map<NamespaceTypePair, - DeleteByQueryResultProto::DocumentGroupInfo*, - NamespaceTypePairHasher> - deleted_info_map; - component_timer = clock_->GetNewTimer(); while (query_results.root_iterator->Advance().ok()) { ICING_VLOG(3) << "Deleting doc " << query_results.root_iterator->doc_hit_info().document_id(); ++num_deleted; - if (return_deleted_document_info) { - status = RetrieveAndAddDocumentInfo( - document_store_.get(), result_proto, deleted_info_map, - query_results.root_iterator->doc_hit_info().document_id()); - if (!status.ok()) { - TransformStatus(status, result_status); - return result_proto; - } - } status = document_store_->Delete( query_results.root_iterator->doc_hit_info().document_id()); if (!status.ok()) { @@ -1140,13 +948,6 @@ DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery( return result_proto; } } - delete_stats->set_document_removal_latency_ms( - component_timer->GetElapsedMilliseconds()); - int term_count = 0; - for (const auto& section_and_terms : query_results.query_terms) { - term_count += section_and_terms.second.size(); - } - delete_stats->set_num_terms(term_count); if (num_deleted > 0) { result_proto.mutable_status()->set_code(StatusProto::OK); @@ -1201,8 +1002,12 @@ OptimizeResultProto IcingSearchEngine::Optimize() { std::unique_ptr<Timer> optimize_timer = clock_->GetNewTimer(); OptimizeStatsProto* optimize_stats = result_proto.mutable_optimize_stats(); int64_t before_size = filesystem_->GetDiskUsage(options_.base_dir().c_str()); - optimize_stats->set_storage_size_before( - Filesystem::SanitizeFileSize(before_size)); + if (before_size != Filesystem::kBadFileSize) { + optimize_stats->set_storage_size_before(before_size); + } else { + // Set -1 as a sentinel value when failures occur. + optimize_stats->set_storage_size_before(-1); + } // Flushes data to disk before doing optimization auto status = InternalPersistToDisk(PersistType::FULL); @@ -1279,8 +1084,12 @@ OptimizeResultProto IcingSearchEngine::Optimize() { optimize_status_file.Write(std::move(optimize_status)); int64_t after_size = filesystem_->GetDiskUsage(options_.base_dir().c_str()); - optimize_stats->set_storage_size_after( - Filesystem::SanitizeFileSize(after_size)); + if (after_size != Filesystem::kBadFileSize) { + optimize_stats->set_storage_size_after(after_size); + } else { + // Set -1 as a sentinel value when failures occur. + optimize_stats->set_storage_size_after(-1); + } optimize_stats->set_latency_ms(optimize_timer->GetElapsedMilliseconds()); TransformStatus(optimization_status, result_status); @@ -1362,8 +1171,11 @@ StorageInfoResultProto IcingSearchEngine::GetStorageInfo() { } int64_t index_size = filesystem_->GetDiskUsage(options_.base_dir().c_str()); - result.mutable_storage_info()->set_total_storage_size( - Filesystem::SanitizeFileSize(index_size)); + if (index_size != Filesystem::kBadFileSize) { + result.mutable_storage_info()->set_total_storage_size(index_size); + } else { + result.mutable_storage_info()->set_total_storage_size(-1); + } *result.mutable_storage_info()->mutable_document_storage_info() = document_store_->GetStorageInfo(); *result.mutable_storage_info()->mutable_schema_store_storage_info() = @@ -1453,8 +1265,8 @@ SearchResultProto IcingSearchEngine::Search( component_timer = clock_->GetNewTimer(); // Scores but does not rank the results. 
libtextclassifier3::StatusOr<std::unique_ptr<ScoringProcessor>> - scoring_processor_or = ScoringProcessor::Create( - scoring_spec, document_store_.get(), schema_store_.get()); + scoring_processor_or = + ScoringProcessor::Create(scoring_spec, document_store_.get()); if (!scoring_processor_or.ok()) { TransformStatus(scoring_processor_or.status(), result_status); return result_proto; @@ -1765,8 +1577,9 @@ IcingSearchEngine::RestoreIndexIfNeeded() { return {libtextclassifier3::Status::OK, false}; } - auto index_processor_or = - IndexProcessor::Create(normalizer_.get(), index_.get(), clock_.get()); + auto index_processor_or = IndexProcessor::Create( + normalizer_.get(), index_.get(), CreateIndexProcessorOptions(options_), + clock_.get()); if (!index_processor_or.ok()) { return {index_processor_or.status(), true}; } @@ -1844,18 +1657,22 @@ libtextclassifier3::StatusOr<bool> IcingSearchEngine::LostPreviousSchema() { } ResetResultProto IcingSearchEngine::Reset() { - absl_ports::unique_lock l(&mutex_); - return ResetInternal(); -} - -ResetResultProto IcingSearchEngine::ResetInternal() { ICING_VLOG(1) << "Resetting IcingSearchEngine"; ResetResultProto result_proto; StatusProto* result_status = result_proto.mutable_status(); + absl_ports::unique_lock l(&mutex_); + initialized_ = false; - ResetMembers(); + + // Resets members variables + schema_store_.reset(); + document_store_.reset(); + language_segmenter_.reset(); + normalizer_.reset(); + index_.reset(); + if (!filesystem_->DeleteDirectoryRecursively(options_.base_dir().c_str())) { result_status->set_code(StatusProto::INTERNAL); return result_proto; @@ -1881,65 +1698,5 @@ ResetResultProto IcingSearchEngine::ResetInternal() { return result_proto; } -SuggestionResponse IcingSearchEngine::SearchSuggestions( - const SuggestionSpecProto& suggestion_spec) { - // TODO(b/146008613) Explore ideas to make this function read-only. - absl_ports::unique_lock l(&mutex_); - SuggestionResponse response; - StatusProto* response_status = response.mutable_status(); - if (!initialized_) { - response_status->set_code(StatusProto::FAILED_PRECONDITION); - response_status->set_message("IcingSearchEngine has not been initialized!"); - return response; - } - - libtextclassifier3::Status status = - ValidateSuggestionSpec(suggestion_spec, performance_configuration_); - if (!status.ok()) { - TransformStatus(status, response_status); - return response; - } - - // Create the suggestion processor. - auto suggestion_processor_or = SuggestionProcessor::Create( - index_.get(), language_segmenter_.get(), normalizer_.get()); - if (!suggestion_processor_or.ok()) { - TransformStatus(suggestion_processor_or.status(), response_status); - return response; - } - std::unique_ptr<SuggestionProcessor> suggestion_processor = - std::move(suggestion_processor_or).ValueOrDie(); - - std::unordered_set<NamespaceId> namespace_ids; - namespace_ids.reserve(suggestion_spec.namespace_filters_size()); - for (std::string_view name_space : suggestion_spec.namespace_filters()) { - auto namespace_id_or = document_store_->GetNamespaceId(name_space); - if (!namespace_id_or.ok()) { - continue; - } - namespace_ids.insert(namespace_id_or.ValueOrDie()); - } - - // Run suggestion based on given SuggestionSpec. 
- NamespaceCheckerImpl namespace_checker_impl(document_store_.get(), - std::move(namespace_ids)); - libtextclassifier3::StatusOr<std::vector<TermMetadata>> terms_or = - suggestion_processor->QuerySuggestions(suggestion_spec, - &namespace_checker_impl); - if (!terms_or.ok()) { - TransformStatus(terms_or.status(), response_status); - return response; - } - - // Convert vector<TermMetaData> into final SuggestionResponse proto. - for (TermMetadata& term : terms_or.ValueOrDie()) { - SuggestionResponse::Suggestion suggestion; - suggestion.set_query(std::move(term.content)); - response.mutable_suggestions()->Add(std::move(suggestion)); - } - response_status->set_code(StatusProto::OK); - return response; -} - } // namespace lib } // namespace icing diff --git a/icing/icing-search-engine.h b/icing/icing-search-engine.h index ff9c7fb..855401f 100644 --- a/icing/icing-search-engine.h +++ b/icing/icing-search-engine.h @@ -280,9 +280,8 @@ class IcingSearchEngine { // NOT_FOUND if the query doesn't match any documents // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet // INTERNAL_ERROR on IO error - DeleteByQueryResultProto DeleteByQuery( - const SearchSpecProto& search_spec, - bool return_deleted_document_info = false) ICING_LOCKS_EXCLUDED(mutex_); + DeleteByQueryResultProto DeleteByQuery(const SearchSpecProto& search_spec) + ICING_LOCKS_EXCLUDED(mutex_); // Retrieves, scores, ranks, and returns the results according to the specs. // Results can be empty. If there're multiple pages of results, @@ -303,17 +302,6 @@ class IcingSearchEngine { const ResultSpecProto& result_spec) ICING_LOCKS_EXCLUDED(mutex_); - // Retrieves, scores, ranks and returns the suggested query string according - // to the specs. Results can be empty. - // - // Returns a SuggestionResponse with status: - // OK with results on success - // INVALID_ARGUMENT if any of specs is invalid - // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet - // INTERNAL_ERROR on any other errors - SuggestionResponse SearchSuggestions( - const SuggestionSpecProto& suggestion_spec) ICING_LOCKS_EXCLUDED(mutex_); - // Fetches the next page of results of a previously executed query. Results // can be empty if next-page token is invalid. Invalid next page tokens are // tokens that are either zero or were previously passed to @@ -464,25 +452,6 @@ class IcingSearchEngine { // Pointer to JNI class references const std::unique_ptr<const JniCache> jni_cache_; - // Resets all members that are created during Initialize. - void ResetMembers() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); - - // Resets all members that are created during Initialize, deletes all - // underlying files and initializes a fresh index. - ResetResultProto ResetInternal() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); - - // Checks for the existence of the init marker file. If the failed init count - // exceeds kMaxUnsuccessfulInitAttempts, all data is deleted and the index is - // initialized from scratch. The updated count (original failed init count + 1 - // ) is written to the marker file. - // - // RETURNS - // OK on success - // INTERNAL if an IO error occurs while trying to update the marker file. - libtextclassifier3::Status CheckInitMarkerFile( - InitializeStatsProto* initialize_stats) - ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); - // Helper method to do the actual work to persist data to disk. We need this // separate method so that other public methods don't need to call // PersistToDisk(). 
Public methods calling each other may cause deadlock @@ -508,6 +477,15 @@ class IcingSearchEngine { InitializeStatsProto* initialize_stats) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + // Do any validation/setup required for the given IcingSearchEngineOptions + // + // Returns: + // OK on success + // INVALID_ARGUMENT if options has invalid values + // INTERNAL on I/O error + libtextclassifier3::Status InitializeOptions() + ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + // Do any initialization/recovery necessary to create a SchemaStore instance. // // Returns: diff --git a/icing/icing-search-engine_benchmark.cc b/icing/icing-search-engine_benchmark.cc index 5e610d5..ba9aed1 100644 --- a/icing/icing-search-engine_benchmark.cc +++ b/icing/icing-search-engine_benchmark.cc @@ -43,6 +43,7 @@ #include "icing/testing/common-matchers.h" #include "icing/testing/document-generator.h" #include "icing/testing/random-string.h" +#include "icing/testing/recorder-test-utils.h" #include "icing/testing/schema-generator.h" #include "icing/testing/tmp-directory.h" @@ -177,12 +178,12 @@ class DestructibleDirectory { }; std::vector<DocumentProto> GenerateRandomDocuments( - EvenDistributionTypeSelector* type_selector, int num_docs, - const std::vector<std::string>& language) { + EvenDistributionTypeSelector* type_selector, int num_docs) { std::vector<std::string> namespaces = CreateNamespaces(kAvgNumNamespaces); EvenDistributionNamespaceSelector namespace_selector(namespaces); std::default_random_engine random; + std::vector<std::string> language = CreateLanguages(kLanguageSize, &random); UniformDistributionLanguageTokenGenerator<std::default_random_engine> token_generator(language, &random); @@ -226,9 +227,8 @@ void BM_IndexLatency(benchmark::State& state) { ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk()); int num_docs = state.range(0); - std::vector<std::string> language = CreateLanguages(kLanguageSize, &random); const std::vector<DocumentProto> random_docs = - GenerateRandomDocuments(&type_selector, num_docs, language); + GenerateRandomDocuments(&type_selector, num_docs); Timer timer; for (const DocumentProto& doc : random_docs) { ASSERT_THAT(icing->Put(doc).status(), ProtoIsOk()); @@ -271,56 +271,6 @@ BENCHMARK(BM_IndexLatency) ->ArgPair(1 << 15, 10) ->ArgPair(1 << 17, 10); -void BM_QueryLatency(benchmark::State& state) { - // Initialize the filesystem - std::string test_dir = GetTestTempDir() + "/icing/benchmark"; - Filesystem filesystem; - DestructibleDirectory ddir(filesystem, test_dir); - - // Create the schema. - std::default_random_engine random; - int num_types = kAvgNumNamespaces * kAvgNumTypes; - ExactStringPropertyGenerator property_generator; - SchemaGenerator<ExactStringPropertyGenerator> schema_generator( - /*num_properties=*/state.range(1), &property_generator); - SchemaProto schema = schema_generator.GenerateSchema(num_types); - EvenDistributionTypeSelector type_selector(schema); - - // Create the index. 
- IcingSearchEngineOptions options; - options.set_base_dir(test_dir); - options.set_index_merge_size(kIcingFullIndexSize); - std::unique_ptr<IcingSearchEngine> icing = - std::make_unique<IcingSearchEngine>(options); - - ASSERT_THAT(icing->Initialize().status(), ProtoIsOk()); - ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk()); - - int num_docs = state.range(0); - std::vector<std::string> language = CreateLanguages(kLanguageSize, &random); - const std::vector<DocumentProto> random_docs = - GenerateRandomDocuments(&type_selector, num_docs, language); - for (const DocumentProto& doc : random_docs) { - ASSERT_THAT(icing->Put(doc).status(), ProtoIsOk()); - } - - SearchSpecProto search_spec = CreateSearchSpec( - language.at(0), std::vector<std::string>(), TermMatchType::PREFIX); - ResultSpecProto result_spec = CreateResultSpec(1000000, 1000000, 1000000); - ScoringSpecProto scoring_spec = - CreateScoringSpec(ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP); - for (auto _ : state) { - SearchResultProto results = icing->Search( - search_spec, ScoringSpecProto::default_instance(), result_spec); - } -} -BENCHMARK(BM_QueryLatency) - // Arguments: num_indexed_documents, num_sections - ->ArgPair(32, 2) - ->ArgPair(128, 2) - ->ArgPair(1 << 10, 2) - ->ArgPair(1 << 13, 2); - void BM_IndexThroughput(benchmark::State& state) { // Initialize the filesystem std::string test_dir = GetTestTempDir() + "/icing/benchmark"; @@ -347,9 +297,8 @@ void BM_IndexThroughput(benchmark::State& state) { ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk()); int num_docs = state.range(0); - std::vector<std::string> language = CreateLanguages(kLanguageSize, &random); const std::vector<DocumentProto> random_docs = - GenerateRandomDocuments(&type_selector, num_docs, language); + GenerateRandomDocuments(&type_selector, num_docs); for (auto s : state) { for (const DocumentProto& doc : random_docs) { ASSERT_THAT(icing->Put(doc).status(), ProtoIsOk()); diff --git a/icing/icing-search-engine_fuzz_test.cc b/icing/icing-search-engine_fuzz_test.cc index bf486da..2d07e37 100644 --- a/icing/icing-search-engine_fuzz_test.cc +++ b/icing/icing-search-engine_fuzz_test.cc @@ -18,12 +18,12 @@ #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/document-builder.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/icing-search-engine.h" #include "icing/proto/document.pb.h" #include "icing/proto/initialize.pb.h" #include "icing/proto/scoring.pb.h" #include "icing/schema-builder.h" -#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" @@ -31,13 +31,13 @@ namespace icing { namespace lib { namespace { -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED = - PropertyConfigProto::Cardinality::REQUIRED; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REQUIRED = + PropertyConfigProto_Cardinality_Code_REQUIRED; -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = - StringIndexingConfig::TokenizerType::PLAIN; +constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = + StringIndexingConfig_TokenizerType_Code_PLAIN; -constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX; +constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX; IcingSearchEngineOptions Setup() { IcingSearchEngineOptions icing_options; diff --git a/icing/icing-search-engine_test.cc b/icing/icing-search-engine_test.cc 
index 13e77b8..4c15827 100644 --- a/icing/icing-search-engine_test.cc +++ b/icing/icing-search-engine_test.cc @@ -27,8 +27,8 @@ #include "icing/document-builder.h" #include "icing/file/filesystem.h" #include "icing/file/mock-filesystem.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/legacy/index/icing-mock-filesystem.h" -#include "icing/portable/endian.h" #include "icing/portable/equals-proto.h" #include "icing/portable/platform.h" #include "icing/proto/document.pb.h" @@ -45,7 +45,6 @@ #include "icing/store/document-log-creator.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" -#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/jni-test-helpers.h" #include "icing/testing/random-string.h" #include "icing/testing/snippet-helpers.h" @@ -90,24 +89,21 @@ constexpr std::string_view kIpsumText = "vehicula posuere vitae, convallis eu lorem. Donec semper augue eu nibh " "placerat semper."; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = - PropertyConfigProto::Cardinality::OPTIONAL; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED = - PropertyConfigProto::Cardinality::REQUIRED; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED = - PropertyConfigProto::Cardinality::REPEATED; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = + PropertyConfigProto_Cardinality_Code_OPTIONAL; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REQUIRED = + PropertyConfigProto_Cardinality_Code_REQUIRED; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED = + PropertyConfigProto_Cardinality_Code_REPEATED; -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = - StringIndexingConfig::TokenizerType::PLAIN; -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_NONE = - StringIndexingConfig::TokenizerType::NONE; +constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = + StringIndexingConfig_TokenizerType_Code_PLAIN; +constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_NONE = + StringIndexingConfig_TokenizerType_Code_NONE; -#ifndef ICING_JNI_TEST -constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; -#endif // !ICING_JNI_TEST - -constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX; -constexpr TermMatchType::Code MATCH_NONE = TermMatchType::UNKNOWN; +constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; +constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX; +constexpr TermMatchType_Code MATCH_NONE = TermMatchType_Code_UNKNOWN; PortableFileBackedProtoLog<DocumentWrapper>::Header ReadDocumentLogHeader( Filesystem filesystem, const std::string& file_path) { @@ -362,6 +358,36 @@ TEST_F(IcingSearchEngineTest, GoodIndexMergeSizeReturnsOk) { EXPECT_THAT(icing.Initialize().status(), ProtoIsOk()); } +TEST_F(IcingSearchEngineTest, + NegativeMaxTokensPerDocSizeReturnsInvalidArgument) { + IcingSearchEngineOptions options = GetDefaultIcingOptions(); + options.set_max_tokens_per_doc(-1); + IcingSearchEngine icing(options, GetTestJniCache()); + EXPECT_THAT(icing.Initialize().status(), + ProtoStatusIs(StatusProto::INVALID_ARGUMENT)); +} + +TEST_F(IcingSearchEngineTest, ZeroMaxTokensPerDocSizeReturnsInvalidArgument) { + IcingSearchEngineOptions options = GetDefaultIcingOptions(); + options.set_max_tokens_per_doc(0); + IcingSearchEngine icing(options, GetTestJniCache()); + EXPECT_THAT(icing.Initialize().status(), + 
ProtoStatusIs(StatusProto::INVALID_ARGUMENT)); +} + +TEST_F(IcingSearchEngineTest, GoodMaxTokensPerDocSizeReturnsOk) { + IcingSearchEngineOptions options = GetDefaultIcingOptions(); + // INT_MAX is valid - it just means that we shouldn't limit the number of + // tokens per document. It would be pretty inconceivable that anyone would + // produce such a document - the text being indexed alone would take up at + // least ~4.3 GiB! - and the document would be rejected before indexing + // for exceeding max_document_size, but there's no reason to explicitly + // bar it. + options.set_max_tokens_per_doc(std::numeric_limits<int32_t>::max()); + IcingSearchEngine icing(options, GetTestJniCache()); + EXPECT_THAT(icing.Initialize().status(), ProtoIsOk()); +} + TEST_F(IcingSearchEngineTest, NegativeMaxTokenLenReturnsInvalidArgument) { IcingSearchEngineOptions options = GetDefaultIcingOptions(); options.set_max_token_length(-1); @@ -478,217 +504,6 @@ TEST_F(IcingSearchEngineTest, FailToCreateDocStore) { HasSubstr("Could not create directory")); } -TEST_F(IcingSearchEngineTest, InitMarkerFilePreviousFailuresAtThreshold) { - Filesystem filesystem; - DocumentProto email1 = - CreateEmailDocument("namespace", "uri1", 100, "subject1", "body1"); - email1.set_creation_timestamp_ms(10000); - DocumentProto email2 = - CreateEmailDocument("namespace", "uri2", 50, "subject2", "body2"); - email2.set_creation_timestamp_ms(10000); - - { - // Create an index with a few documents. - IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); - InitializeResultProto init_result = icing.Initialize(); - ASSERT_THAT(init_result.status(), ProtoIsOk()); - ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(), - Eq(0)); - ASSERT_THAT(icing.SetSchema(CreateEmailSchema()).status(), ProtoIsOk()); - ASSERT_THAT(icing.Put(email1).status(), ProtoIsOk()); - ASSERT_THAT(icing.Put(email2).status(), ProtoIsOk()); - } - - // Write an init marker file with 5 previously failed attempts. - std::string marker_filepath = GetTestBaseDir() + "/init_marker"; - - { - ScopedFd marker_file_fd(filesystem.OpenForWrite(marker_filepath.c_str())); - int network_init_attempts = GHostToNetworkL(5); - // Write the updated number of attempts before we get started. - ASSERT_TRUE(filesystem.PWrite(marker_file_fd.get(), 0, - &network_init_attempts, - sizeof(network_init_attempts))); - ASSERT_TRUE(filesystem.DataSync(marker_file_fd.get())); - } - - { - // Create the index again and verify that initialization succeeds and no - // data is thrown out. - IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); - InitializeResultProto init_result = icing.Initialize(); - ASSERT_THAT(init_result.status(), ProtoIsOk()); - ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(), - Eq(5)); - EXPECT_THAT( - icing.Get("namespace", "uri1", GetResultSpecProto::default_instance()) - .document(), - EqualsProto(email1)); - EXPECT_THAT( - icing.Get("namespace", "uri2", GetResultSpecProto::default_instance()) - .document(), - EqualsProto(email2)); - } - - // The successful init should have thrown out the marker file. - ASSERT_FALSE(filesystem.FileExists(marker_filepath.c_str())); -} - -TEST_F(IcingSearchEngineTest, InitMarkerFilePreviousFailuresBeyondThreshold) { - Filesystem filesystem; - DocumentProto email1 = - CreateEmailDocument("namespace", "uri1", 100, "subject1", "body1"); - DocumentProto email2 = - CreateEmailDocument("namespace", "uri2", 50, "subject2", "body2"); - - { - // Create an index with a few documents. 
- IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); - InitializeResultProto init_result = icing.Initialize(); - ASSERT_THAT(init_result.status(), ProtoIsOk()); - ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(), - Eq(0)); - ASSERT_THAT(icing.SetSchema(CreateEmailSchema()).status(), ProtoIsOk()); - ASSERT_THAT(icing.Put(email1).status(), ProtoIsOk()); - ASSERT_THAT(icing.Put(email2).status(), ProtoIsOk()); - } - - // Write an init marker file with 6 previously failed attempts. - std::string marker_filepath = GetTestBaseDir() + "/init_marker"; - - { - ScopedFd marker_file_fd(filesystem.OpenForWrite(marker_filepath.c_str())); - int network_init_attempts = GHostToNetworkL(6); - // Write the updated number of attempts before we get started. - ASSERT_TRUE(filesystem.PWrite(marker_file_fd.get(), 0, - &network_init_attempts, - sizeof(network_init_attempts))); - ASSERT_TRUE(filesystem.DataSync(marker_file_fd.get())); - } - - { - // Create the index again and verify that initialization succeeds and all - // data is thrown out. - IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); - InitializeResultProto init_result = icing.Initialize(); - ASSERT_THAT(init_result.status(), - ProtoStatusIs(StatusProto::WARNING_DATA_LOSS)); - ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(), - Eq(6)); - EXPECT_THAT( - icing.Get("namespace", "uri1", GetResultSpecProto::default_instance()) - .status(), - ProtoStatusIs(StatusProto::NOT_FOUND)); - EXPECT_THAT( - icing.Get("namespace", "uri2", GetResultSpecProto::default_instance()) - .status(), - ProtoStatusIs(StatusProto::NOT_FOUND)); - } - - // The successful init should have thrown out the marker file. - ASSERT_FALSE(filesystem.FileExists(marker_filepath.c_str())); -} - -TEST_F(IcingSearchEngineTest, SuccessiveInitFailuresIncrementsInitMarker) { - Filesystem filesystem; - DocumentProto email1 = - CreateEmailDocument("namespace", "uri1", 100, "subject1", "body1"); - DocumentProto email2 = - CreateEmailDocument("namespace", "uri2", 50, "subject2", "body2"); - - { - // 1. Create an index with a few documents. - IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); - InitializeResultProto init_result = icing.Initialize(); - ASSERT_THAT(init_result.status(), ProtoIsOk()); - ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(), - Eq(0)); - ASSERT_THAT(icing.SetSchema(CreateEmailSchema()).status(), ProtoIsOk()); - ASSERT_THAT(icing.Put(email1).status(), ProtoIsOk()); - ASSERT_THAT(icing.Put(email2).status(), ProtoIsOk()); - } - - { - // 2. Create an index that will encounter an IO failure when trying to - // create the document log. - IcingSearchEngineOptions icing_options = GetDefaultIcingOptions(); - - auto mock_filesystem = std::make_unique<MockFilesystem>(); - std::string document_log_filepath = - icing_options.base_dir() + "/document_dir/document_log_v1"; - auto get_filesize_lambda = [this, - &document_log_filepath](const char* filename) { - if (strncmp(document_log_filepath.c_str(), filename, - document_log_filepath.length()) == 0) { - return Filesystem::kBadFileSize; - } - return this->filesystem()->GetFileSize(filename); - }; - ON_CALL(*mock_filesystem, GetFileSize(A<const char*>())) - .WillByDefault(get_filesize_lambda); - - TestIcingSearchEngine icing(icing_options, std::move(mock_filesystem), - std::make_unique<IcingFilesystem>(), - std::make_unique<FakeClock>(), - GetTestJniCache()); - - // Fail to initialize six times in a row. 
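// The two marker-file tests in this hunk pin the recovery policy down from
// both sides: five recorded failures still initialize cleanly with data
// intact, while a sixth crosses the threshold and wipes everything with
// WARNING_DATA_LOSS. A sketch of that policy; kMaxUnsuccessfulInitAttempts
// is named in the header comment earlier in this change, but its value of 5
// is inferred from these tests rather than quoted from the implementation.
#include <cstdint>

constexpr int32_t kMaxUnsuccessfulInitAttempts = 5;  // inferred from tests

bool ShouldWipeDataOnInit(int32_t previous_failed_attempts) {
  // Wipe only when the count *exceeds* the threshold: 5 previous failures
  // keep the data, a 6th does not.
  return previous_failed_attempts > kMaxUnsuccessfulInitAttempts;
}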
- InitializeResultProto init_result = icing.Initialize(); - ASSERT_THAT(init_result.status(), ProtoStatusIs(StatusProto::INTERNAL)); - ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(), - Eq(0)); - - init_result = icing.Initialize(); - ASSERT_THAT(init_result.status(), ProtoStatusIs(StatusProto::INTERNAL)); - ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(), - Eq(1)); - - init_result = icing.Initialize(); - ASSERT_THAT(init_result.status(), ProtoStatusIs(StatusProto::INTERNAL)); - ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(), - Eq(2)); - - init_result = icing.Initialize(); - ASSERT_THAT(init_result.status(), ProtoStatusIs(StatusProto::INTERNAL)); - ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(), - Eq(3)); - - init_result = icing.Initialize(); - ASSERT_THAT(init_result.status(), ProtoStatusIs(StatusProto::INTERNAL)); - ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(), - Eq(4)); - - init_result = icing.Initialize(); - ASSERT_THAT(init_result.status(), ProtoStatusIs(StatusProto::INTERNAL)); - ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(), - Eq(5)); - } - - { - // 3. Create the index again and verify that initialization succeeds and all - // data is thrown out. - IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); - InitializeResultProto init_result = icing.Initialize(); - ASSERT_THAT(init_result.status(), - ProtoStatusIs(StatusProto::WARNING_DATA_LOSS)); - ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(), - Eq(6)); - - EXPECT_THAT( - icing.Get("namespace", "uri1", GetResultSpecProto::default_instance()) - .status(), - ProtoStatusIs(StatusProto::NOT_FOUND)); - EXPECT_THAT( - icing.Get("namespace", "uri2", GetResultSpecProto::default_instance()) - .status(), - ProtoStatusIs(StatusProto::NOT_FOUND)); - } - - // The successful init should have thrown out the marker file. - std::string marker_filepath = GetTestBaseDir() + "/init_marker"; - ASSERT_FALSE(filesystem.FileExists(marker_filepath.c_str())); -} - TEST_F(IcingSearchEngineTest, CircularReferenceCreateSectionManagerReturnsInvalidArgument) { // Create a type config with a circular reference. @@ -765,7 +580,8 @@ TEST_F(IcingSearchEngineTest, FailToWriteSchema) { auto mock_filesystem = std::make_unique<MockFilesystem>(); // This fails FileBackedProto::Write() - ON_CALL(*mock_filesystem, OpenForWrite(HasSubstr("schema.pb"))) + ON_CALL(*mock_filesystem, + OpenForWrite(Eq(icing_options.base_dir() + "/schema_dir/schema.pb"))) .WillByDefault(Return(-1)); TestIcingSearchEngine icing(icing_options, std::move(mock_filesystem), @@ -922,13 +738,7 @@ TEST_F(IcingSearchEngineTest, SetSchemaCompatibleVersionUpdateSucceeds) { property->set_data_type(PropertyConfigProto::DataType::STRING); property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - SetSchemaResultProto set_schema_result = icing.SetSchema(schema); - // Ignore latency numbers. They're covered elsewhere. 
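// "Ignore latency numbers. They're covered elsewhere." is a recurring pattern
// in these tests: clock-driven fields are cleared from the actual proto
// before an exact EqualsProto comparison, so golden protos never need to
// predict timings. A reduced sketch of the same idea, with a plain struct
// standing in for SetSchemaResultProto:
#include <cstdint>

struct Result {
  int32_t status_code = 0;
  int64_t latency_ms = 0;  // nondeterministic: depends on the clock
};

bool EqualsIgnoringLatency(Result actual, const Result& expected) {
  actual.latency_ms = 0;  // mirrors set_schema_result.clear_latency_ms()
  return actual.status_code == expected.status_code &&
         actual.latency_ms == expected.latency_ms;
}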
- set_schema_result.clear_latency_ms(); - SetSchemaResultProto expected_set_schema_result; - expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); - expected_set_schema_result.mutable_new_schema_types()->Add("Email"); - EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); + EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); EXPECT_THAT(icing.GetSchema().schema().types(0).version(), Eq(1)); } @@ -946,20 +756,12 @@ TEST_F(IcingSearchEngineTest, SetSchemaCompatibleVersionUpdateSucceeds) { property->set_property_name("title"); property->set_data_type(PropertyConfigProto::DataType::STRING); property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - property = type->add_properties(); property->set_property_name("body"); property->set_data_type(PropertyConfigProto::DataType::STRING); property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); // 3. SetSchema should succeed and the version number should be updated. - SetSchemaResultProto set_schema_result = icing.SetSchema(schema, true); - // Ignore latency numbers. They're covered elsewhere. - set_schema_result.clear_latency_ms(); - SetSchemaResultProto expected_set_schema_result; - expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); - expected_set_schema_result.mutable_fully_compatible_changed_schema_types() - ->Add("Email"); - EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); + EXPECT_THAT(icing.SetSchema(schema, true).status(), ProtoIsOk()); EXPECT_THAT(icing.GetSchema().schema().types(0).version(), Eq(2)); } @@ -1145,12 +947,7 @@ TEST_F(IcingSearchEngineTest, } TEST_F(IcingSearchEngineTest, SetSchema) { - auto fake_clock = std::make_unique<FakeClock>(); - fake_clock->SetTimerElapsedMilliseconds(1000); - TestIcingSearchEngine icing(GetDefaultIcingOptions(), - std::make_unique<Filesystem>(), - std::make_unique<IcingFilesystem>(), - std::move(fake_clock), GetTestJniCache()); + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); auto message_document = CreateMessageDocument("namespace", "uri"); @@ -1179,31 +976,26 @@ TEST_F(IcingSearchEngineTest, SetSchema) { empty_type->set_schema_type(""); // Make sure we can't set invalid schemas - SetSchemaResultProto set_schema_result = icing.SetSchema(invalid_schema); - EXPECT_THAT(set_schema_result.status(), + EXPECT_THAT(icing.SetSchema(invalid_schema).status(), ProtoStatusIs(StatusProto::INVALID_ARGUMENT)); - EXPECT_THAT(set_schema_result.latency_ms(), Eq(1000)); // Can add an document of a set schema - set_schema_result = icing.SetSchema(schema_with_message); - EXPECT_THAT(set_schema_result.status(), ProtoStatusIs(StatusProto::OK)); - EXPECT_THAT(set_schema_result.latency_ms(), Eq(1000)); + EXPECT_THAT(icing.SetSchema(schema_with_message).status(), ProtoIsOk()); EXPECT_THAT(icing.Put(message_document).status(), ProtoIsOk()); // Schema with Email doesn't have Message, so would result incompatible // data - set_schema_result = icing.SetSchema(schema_with_email); - EXPECT_THAT(set_schema_result.status(), + EXPECT_THAT(icing.SetSchema(schema_with_email).status(), ProtoStatusIs(StatusProto::FAILED_PRECONDITION)); - EXPECT_THAT(set_schema_result.latency_ms(), Eq(1000)); // Can expand the set of schema types and add an document of a new // schema type - set_schema_result = icing.SetSchema(schema_with_email_and_message); - EXPECT_THAT(set_schema_result.status(), ProtoStatusIs(StatusProto::OK)); - 
EXPECT_THAT(set_schema_result.latency_ms(), Eq(1000)); - + EXPECT_THAT(icing.SetSchema(SchemaProto(schema_with_email_and_message)) + .status() + .code(), + Eq(StatusProto::OK)); EXPECT_THAT(icing.Put(message_document).status(), ProtoIsOk()); + // Can't add an document whose schema isn't set auto photo_document = DocumentBuilder() .SetKey("namespace", "uri") @@ -1217,7 +1009,7 @@ TEST_F(IcingSearchEngineTest, SetSchema) { } TEST_F(IcingSearchEngineTest, - SetSchemaNewIndexedPropertyTriggersIndexRestorationAndReturnsOk) { + SetSchemaTriggersIndexRestorationAndReturnsOk) { IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); @@ -1226,15 +1018,8 @@ TEST_F(IcingSearchEngineTest, ->mutable_properties(0) ->clear_string_indexing_config(); - SetSchemaResultProto set_schema_result = - icing.SetSchema(schema_with_no_indexed_property); - // Ignore latency numbers. They're covered elsewhere. - set_schema_result.clear_latency_ms(); - SetSchemaResultProto expected_set_schema_result; - expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); - expected_set_schema_result.mutable_new_schema_types()->Add("Message"); - EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); - + EXPECT_THAT(icing.SetSchema(schema_with_no_indexed_property).status(), + ProtoIsOk()); // Nothing will be index and Search() won't return anything. EXPECT_THAT(icing.Put(CreateMessageDocument("namespace", "uri")).status(), ProtoIsOk()); @@ -1255,14 +1040,8 @@ TEST_F(IcingSearchEngineTest, SchemaProto schema_with_indexed_property = CreateMessageSchema(); // Index restoration should be triggered here because new schema requires more // properties to be indexed. - set_schema_result = icing.SetSchema(schema_with_indexed_property); - // Ignore latency numbers. They're covered elsewhere. - set_schema_result.clear_latency_ms(); - expected_set_schema_result = SetSchemaResultProto(); - expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); - expected_set_schema_result.mutable_index_incompatible_changed_schema_types() - ->Add("Message"); - EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); + EXPECT_THAT(icing.SetSchema(schema_with_indexed_property).status(), + ProtoIsOk()); SearchResultProto expected_search_result_proto; expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); @@ -1306,12 +1085,8 @@ TEST_F(IcingSearchEngineTest, .Build(); SetSchemaResultProto set_schema_result = icing.SetSchema(nested_schema); - // Ignore latency numbers. They're covered elsewhere. - set_schema_result.clear_latency_ms(); SetSchemaResultProto expected_set_schema_result; expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); - expected_set_schema_result.mutable_new_schema_types()->Add("Email"); - expected_set_schema_result.mutable_new_schema_types()->Add("Person"); EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); DocumentProto document = @@ -1378,12 +1153,8 @@ TEST_F(IcingSearchEngineTest, .Build(); set_schema_result = icing.SetSchema(no_nested_schema); - // Ignore latency numbers. They're covered elsewhere. 
- set_schema_result.clear_latency_ms(); expected_set_schema_result = SetSchemaResultProto(); expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); - expected_set_schema_result.mutable_index_incompatible_changed_schema_types() - ->Add("Email"); EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); // document shouldn't match a query for 'Bill' in either 'sender.name' or @@ -1426,10 +1197,7 @@ TEST_F(IcingSearchEngineTest, SetSchemaResultProto set_schema_result = icing.SetSchema(email_with_body_schema); - // Ignore latency numbers. They're covered elsewhere. - set_schema_result.clear_latency_ms(); SetSchemaResultProto expected_set_schema_result; - expected_set_schema_result.mutable_new_schema_types()->Add("Email"); expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); @@ -1475,12 +1243,8 @@ TEST_F(IcingSearchEngineTest, set_schema_result = icing.SetSchema( email_no_body_schema, /*ignore_errors_and_delete_documents=*/true); - // Ignore latency numbers. They're covered elsewhere. - set_schema_result.clear_latency_ms(); expected_set_schema_result = SetSchemaResultProto(); expected_set_schema_result.mutable_incompatible_schema_types()->Add("Email"); - expected_set_schema_result.mutable_index_incompatible_changed_schema_types() - ->Add("Email"); expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); @@ -1518,10 +1282,7 @@ TEST_F( SetSchemaResultProto set_schema_result = icing.SetSchema(email_with_body_schema); - // Ignore latency numbers. They're covered elsewhere. - set_schema_result.clear_latency_ms(); SetSchemaResultProto expected_set_schema_result; - expected_set_schema_result.mutable_new_schema_types()->Add("Email"); expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); @@ -1575,12 +1336,8 @@ TEST_F( set_schema_result = icing.SetSchema( email_no_body_schema, /*ignore_errors_and_delete_documents=*/true); - // Ignore latency numbers. They're covered elsewhere. - set_schema_result.clear_latency_ms(); expected_set_schema_result = SetSchemaResultProto(); expected_set_schema_result.mutable_incompatible_schema_types()->Add("Email"); - expected_set_schema_result.mutable_index_incompatible_changed_schema_types() - ->Add("Email"); expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); @@ -1628,11 +1385,7 @@ TEST_F(IcingSearchEngineTest, ForceSetSchemaIncompatibleNestedDocsAreDeleted) { .Build(); SetSchemaResultProto set_schema_result = icing.SetSchema(nested_schema); - // Ignore latency numbers. They're covered elsewhere. - set_schema_result.clear_latency_ms(); SetSchemaResultProto expected_set_schema_result; - expected_set_schema_result.mutable_new_schema_types()->Add("Email"); - expected_set_schema_result.mutable_new_schema_types()->Add("Person"); expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); @@ -1685,15 +1438,9 @@ TEST_F(IcingSearchEngineTest, ForceSetSchemaIncompatibleNestedDocsAreDeleted) { set_schema_result = icing.SetSchema( nested_schema, /*ignore_errors_and_delete_documents=*/true); - // Ignore latency numbers. They're covered elsewhere. 
- set_schema_result.clear_latency_ms(); expected_set_schema_result = SetSchemaResultProto(); expected_set_schema_result.mutable_incompatible_schema_types()->Add("Person"); expected_set_schema_result.mutable_incompatible_schema_types()->Add("Email"); - expected_set_schema_result.mutable_index_incompatible_changed_schema_types() - ->Add("Email"); - expected_set_schema_result.mutable_index_incompatible_changed_schema_types() - ->Add("Person"); expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); @@ -1752,10 +1499,6 @@ TEST_F(IcingSearchEngineTest, SetSchemaRevalidatesDocumentsAndReturnsOk) { property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); // Can't set the schema since it's incompatible - SetSchemaResultProto set_schema_result = - icing.SetSchema(schema_with_required_subject); - // Ignore latency numbers. They're covered elsewhere. - set_schema_result.clear_latency_ms(); SetSchemaResultProto expected_set_schema_result_proto; expected_set_schema_result_proto.mutable_status()->set_code( StatusProto::FAILED_PRECONDITION); @@ -1763,17 +1506,15 @@ TEST_F(IcingSearchEngineTest, SetSchemaRevalidatesDocumentsAndReturnsOk) { "Schema is incompatible."); expected_set_schema_result_proto.add_incompatible_schema_types("email"); - EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result_proto)); + EXPECT_THAT(icing.SetSchema(schema_with_required_subject), + EqualsProto(expected_set_schema_result_proto)); // Force set it - set_schema_result = - icing.SetSchema(schema_with_required_subject, - /*ignore_errors_and_delete_documents=*/true); - // Ignore latency numbers. They're covered elsewhere. - set_schema_result.clear_latency_ms(); expected_set_schema_result_proto.mutable_status()->set_code(StatusProto::OK); expected_set_schema_result_proto.mutable_status()->clear_message(); - EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result_proto)); + EXPECT_THAT(icing.SetSchema(schema_with_required_subject, + /*ignore_errors_and_delete_documents=*/true), + EqualsProto(expected_set_schema_result_proto)); GetResultProto expected_get_result_proto; expected_get_result_proto.mutable_status()->set_code(StatusProto::OK); @@ -1830,25 +1571,19 @@ TEST_F(IcingSearchEngineTest, SetSchemaDeletesDocumentsAndReturnsOk) { type->set_schema_type("email"); // Can't set the schema since it's incompatible - SetSchemaResultProto set_schema_result = icing.SetSchema(new_schema); - // Ignore latency numbers. They're covered elsewhere. - set_schema_result.clear_latency_ms(); SetSchemaResultProto expected_result; expected_result.mutable_status()->set_code(StatusProto::FAILED_PRECONDITION); expected_result.mutable_status()->set_message("Schema is incompatible."); expected_result.add_deleted_schema_types("message"); - EXPECT_THAT(set_schema_result, EqualsProto(expected_result)); + EXPECT_THAT(icing.SetSchema(new_schema), EqualsProto(expected_result)); // Force set it - set_schema_result = - icing.SetSchema(new_schema, - /*ignore_errors_and_delete_documents=*/true); - // Ignore latency numbers. They're covered elsewhere. 
- set_schema_result.clear_latency_ms(); expected_result.mutable_status()->set_code(StatusProto::OK); expected_result.mutable_status()->clear_message(); - EXPECT_THAT(set_schema_result, EqualsProto(expected_result)); + EXPECT_THAT(icing.SetSchema(new_schema, + /*ignore_errors_and_delete_documents=*/true), + EqualsProto(expected_result)); // "email" document is still there GetResultProto expected_get_result_proto; @@ -2167,7 +1902,7 @@ TEST_F(IcingSearchEngineTest, SearchReturnsValidResults) { search_spec.set_query("message"); ResultSpecProto result_spec; - result_spec.mutable_snippet_spec()->set_max_window_utf32_length(64); + result_spec.mutable_snippet_spec()->set_max_window_bytes(64); result_spec.mutable_snippet_spec()->set_num_matches_per_property(1); result_spec.mutable_snippet_spec()->set_num_to_snippet(1); @@ -2585,7 +2320,7 @@ TEST_F(IcingSearchEngineTest, ShouldReturnMultiplePagesWithSnippets) { ResultSpecProto result_spec; result_spec.set_num_per_page(2); - result_spec.mutable_snippet_spec()->set_max_window_utf32_length(64); + result_spec.mutable_snippet_spec()->set_max_window_bytes(64); result_spec.mutable_snippet_spec()->set_num_matches_per_property(1); result_spec.mutable_snippet_spec()->set_num_to_snippet(3); @@ -2992,17 +2727,13 @@ TEST_F(IcingSearchEngineTest, OptimizationFailureUninitializesIcing) { }; ON_CALL(*mock_filesystem, CreateDirectoryRecursively) .WillByDefault(create_dir_lambda); - auto swap_lambda = [&just_swapped_files](const char* first_dir, const char* second_dir) { just_swapped_files = true; return false; }; - IcingSearchEngineOptions options = GetDefaultIcingOptions(); - ON_CALL(*mock_filesystem, SwapFiles(HasSubstr("document_dir_optimize_tmp"), - HasSubstr("document_dir"))) - .WillByDefault(swap_lambda); - TestIcingSearchEngine icing(options, std::move(mock_filesystem), + ON_CALL(*mock_filesystem, SwapFiles).WillByDefault(swap_lambda); + TestIcingSearchEngine icing(GetDefaultIcingOptions(), std::move(mock_filesystem), std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(), GetTestJniCache()); @@ -3455,16 +3186,11 @@ TEST_F(IcingSearchEngineTest, DeleteByQuery) { search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); DeleteByQueryResultProto result_proto = icing.DeleteByQuery(search_spec); EXPECT_THAT(result_proto.status(), ProtoIsOk()); - DeleteByQueryStatsProto exp_stats; + DeleteStatsProto exp_stats; + exp_stats.set_delete_type(DeleteStatsProto::DeleteType::QUERY); exp_stats.set_latency_ms(7); exp_stats.set_num_documents_deleted(1); - exp_stats.set_query_length(search_spec.query().length()); - exp_stats.set_num_terms(1); - exp_stats.set_num_namespaces_filtered(0); - exp_stats.set_num_schema_types_filtered(0); - exp_stats.set_parse_query_latency_ms(7); - exp_stats.set_document_removal_latency_ms(7); - EXPECT_THAT(result_proto.delete_by_query_stats(), EqualsProto(exp_stats)); + EXPECT_THAT(result_proto.delete_stats(), EqualsProto(exp_stats)); expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND); expected_get_result_proto.mutable_status()->set_message( @@ -3496,105 +3222,6 @@ TEST_F(IcingSearchEngineTest, DeleteByQuery) { expected_search_result_proto)); } -TEST_F(IcingSearchEngineTest, DeleteByQueryReturnInfo) { - DocumentProto document1 = - DocumentBuilder() - .SetKey("namespace1", "uri1") - .SetSchema("Message") - .AddStringProperty("body", "message body1") - .SetCreationTimestampMs(kDefaultCreationTimestampMs) - .Build(); - DocumentProto document2 = - DocumentBuilder() - .SetKey("namespace2", "uri2") - 
.SetSchema("Message") - .AddStringProperty("body", "message body2") - .SetCreationTimestampMs(kDefaultCreationTimestampMs) - .Build(); - DocumentProto document3 = - DocumentBuilder() - .SetKey("namespace2", "uri3") - .SetSchema("Message") - .AddStringProperty("body", "message body3") - .SetCreationTimestampMs(kDefaultCreationTimestampMs) - .Build(); - - auto fake_clock = std::make_unique<FakeClock>(); - fake_clock->SetTimerElapsedMilliseconds(7); - TestIcingSearchEngine icing(GetDefaultIcingOptions(), - std::make_unique<Filesystem>(), - std::make_unique<IcingFilesystem>(), - std::move(fake_clock), GetTestJniCache()); - ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); - ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); - ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); - ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); - ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk()); - - GetResultProto expected_get_result_proto; - expected_get_result_proto.mutable_status()->set_code(StatusProto::OK); - *expected_get_result_proto.mutable_document() = document1; - EXPECT_THAT( - icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance()), - EqualsProto(expected_get_result_proto)); - - *expected_get_result_proto.mutable_document() = document2; - EXPECT_THAT( - icing.Get("namespace2", "uri2", GetResultSpecProto::default_instance()), - EqualsProto(expected_get_result_proto)); - - *expected_get_result_proto.mutable_document() = document3; - EXPECT_THAT( - icing.Get("namespace2", "uri3", GetResultSpecProto::default_instance()), - EqualsProto(expected_get_result_proto)); - - // Delete all docs to test the information is correctly grouped. - SearchSpecProto search_spec; - search_spec.set_query("message"); - search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); - DeleteByQueryResultProto result_proto = - icing.DeleteByQuery(search_spec, true); - EXPECT_THAT(result_proto.status(), ProtoIsOk()); - DeleteByQueryStatsProto exp_stats; - exp_stats.set_latency_ms(7); - exp_stats.set_num_documents_deleted(3); - exp_stats.set_query_length(search_spec.query().length()); - exp_stats.set_num_terms(1); - exp_stats.set_num_namespaces_filtered(0); - exp_stats.set_num_schema_types_filtered(0); - exp_stats.set_parse_query_latency_ms(7); - exp_stats.set_document_removal_latency_ms(7); - EXPECT_THAT(result_proto.delete_by_query_stats(), EqualsProto(exp_stats)); - - // Check that DeleteByQuery can return information for deleted documents. - DeleteByQueryResultProto::DocumentGroupInfo info1, info2; - info1.set_namespace_("namespace1"); - info1.set_schema("Message"); - info1.add_uris("uri1"); - info2.set_namespace_("namespace2"); - info2.set_schema("Message"); - info2.add_uris("uri3"); - info2.add_uris("uri2"); - EXPECT_THAT(result_proto.deleted_documents(), - UnorderedElementsAre(EqualsProto(info1), EqualsProto(info2))); - - EXPECT_THAT( - icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance()) - .status() - .code(), - Eq(StatusProto::NOT_FOUND)); - EXPECT_THAT( - icing.Get("namespace2", "uri2", GetResultSpecProto::default_instance()) - .status() - .code(), - Eq(StatusProto::NOT_FOUND)); - EXPECT_THAT( - icing.Get("namespace2", "uri3", GetResultSpecProto::default_instance()) - .status() - .code(), - Eq(StatusProto::NOT_FOUND)); -} - TEST_F(IcingSearchEngineTest, DeleteByQueryNotFound) { DocumentProto document1 = DocumentBuilder() @@ -3755,8 +3382,7 @@ TEST_F(IcingSearchEngineTest, IcingShouldWorkFineIfOptimizationIsAborted) { // fails. 
This will fail IcingSearchEngine::OptimizeDocumentStore() and makes // it return ABORTED_ERROR. auto mock_filesystem = std::make_unique<MockFilesystem>(); - ON_CALL(*mock_filesystem, - DeleteDirectoryRecursively(HasSubstr("_optimize_tmp"))) + ON_CALL(*mock_filesystem, DeleteDirectoryRecursively) .WillByDefault(Return(false)); TestIcingSearchEngine icing(GetDefaultIcingOptions(), @@ -3803,8 +3429,7 @@ TEST_F(IcingSearchEngineTest, // Creates a mock filesystem in which SwapFiles() always fails and deletes the // directories. This will fail IcingSearchEngine::OptimizeDocumentStore(). auto mock_filesystem = std::make_unique<MockFilesystem>(); - ON_CALL(*mock_filesystem, SwapFiles(HasSubstr("document_dir_optimize_tmp"), - HasSubstr("document_dir"))) + ON_CALL(*mock_filesystem, SwapFiles) .WillByDefault([this](const char* one, const char* two) { filesystem()->DeleteDirectoryRecursively(one); filesystem()->DeleteDirectoryRecursively(two); @@ -3875,8 +3500,7 @@ TEST_F(IcingSearchEngineTest, OptimizationShouldRecoverIfDataFilesAreMissing) { // Creates a mock filesystem in which SwapFiles() always fails and empties the // directories. This will fail IcingSearchEngine::OptimizeDocumentStore(). auto mock_filesystem = std::make_unique<MockFilesystem>(); - ON_CALL(*mock_filesystem, SwapFiles(HasSubstr("document_dir_optimize_tmp"), - HasSubstr("document_dir"))) + ON_CALL(*mock_filesystem, SwapFiles) .WillByDefault([this](const char* one, const char* two) { filesystem()->DeleteDirectoryRecursively(one); filesystem()->CreateDirectoryRecursively(one); @@ -5807,230 +5431,74 @@ TEST_F(IcingSearchEngineTest, SetSchemaCanDetectPreviousSchemaWasLost) { EqualsSearchResultIgnoreStatsAndScores(empty_result)); } -TEST_F(IcingSearchEngineTest, ImplicitPersistToDiskFullSavesEverything) { - DocumentProto document = CreateMessageDocument("namespace", "uri"); +TEST_F(IcingSearchEngineTest, PersistToDisk) { + GetResultProto expected_get_result_proto; + expected_get_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_get_result_proto.mutable_document() = + CreateMessageDocument("namespace", "uri"); + { IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); EXPECT_THAT(icing.Initialize().status(), ProtoIsOk()); EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); - EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); - } // Destructing calls a PersistToDisk(FULL) - - IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); - - // There should be no recovery since everything should be saved properly. - InitializeResultProto init_result = icing.Initialize(); - EXPECT_THAT(init_result.status(), ProtoIsOk()); - EXPECT_THAT(init_result.initialize_stats().document_store_data_status(), - Eq(InitializeStatsProto::NO_DATA_LOSS)); - EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(), - Eq(InitializeStatsProto::NONE)); - EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(), - Eq(InitializeStatsProto::NONE)); - EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(), - Eq(InitializeStatsProto::NONE)); - - // Schema is still intact. 
- GetSchemaResultProto expected_get_schema_result_proto; - expected_get_schema_result_proto.mutable_status()->set_code(StatusProto::OK); - *expected_get_schema_result_proto.mutable_schema() = CreateMessageSchema(); + EXPECT_THAT(icing.Put(CreateMessageDocument("namespace", "uri")).status(), + ProtoIsOk()); - EXPECT_THAT(icing.GetSchema(), EqualsProto(expected_get_schema_result_proto)); + // Persisting shouldn't affect anything + EXPECT_THAT(icing.PersistToDisk(PersistType::FULL).status(), ProtoIsOk()); - // Documents are still intact. - GetResultProto expected_get_result_proto; - expected_get_result_proto.mutable_status()->set_code(StatusProto::OK); - *expected_get_result_proto.mutable_document() = document; + EXPECT_THAT( + icing.Get("namespace", "uri", GetResultSpecProto::default_instance()), + EqualsProto(expected_get_result_proto)); + } // Destructing persists as well + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + EXPECT_THAT(icing.Initialize().status(), ProtoIsOk()); EXPECT_THAT( icing.Get("namespace", "uri", GetResultSpecProto::default_instance()), EqualsProto(expected_get_result_proto)); - - // Index is still intact. - SearchSpecProto search_spec; - search_spec.set_term_match_type(TermMatchType::PREFIX); - search_spec.set_query("message"); // Content in the Message document. - - SearchResultProto expected_search_result_proto; - expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); - *expected_search_result_proto.mutable_results()->Add()->mutable_document() = - document; - - SearchResultProto actual_results = - icing.Search(search_spec, GetDefaultScoringSpec(), - ResultSpecProto::default_instance()); - EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( - expected_search_result_proto)); } -TEST_F(IcingSearchEngineTest, ExplicitPersistToDiskFullSavesEverything) { - DocumentProto document = CreateMessageDocument("namespace", "uri"); - - // Add schema and documents to our first icing1 instance. +TEST_F(IcingSearchEngineTest, NoPersistToDiskLiteDoesntPersistPut) { IcingSearchEngine icing1(GetDefaultIcingOptions(), GetTestJniCache()); EXPECT_THAT(icing1.Initialize().status(), ProtoIsOk()); EXPECT_THAT(icing1.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); - EXPECT_THAT(icing1.Put(document).status(), ProtoIsOk()); - EXPECT_THAT(icing1.PersistToDisk(PersistType::FULL).status(), ProtoIsOk()); - - // Initialize a second icing2 instance which should have it's own memory - // space. If data from icing1 isn't being persisted to the files, then icing2 - // won't be able to see those changes. - IcingSearchEngine icing2(GetDefaultIcingOptions(), GetTestJniCache()); - - // There should be no recovery since everything should be saved properly. - InitializeResultProto init_result = icing2.Initialize(); - EXPECT_THAT(init_result.status(), ProtoIsOk()); - EXPECT_THAT(init_result.initialize_stats().document_store_data_status(), - Eq(InitializeStatsProto::NO_DATA_LOSS)); - EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(), - Eq(InitializeStatsProto::NONE)); - EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(), - Eq(InitializeStatsProto::NONE)); - EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(), - Eq(InitializeStatsProto::NONE)); - - // Schema is still intact. 
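// Taken together, this test and the LITE/no-persist variants below spell out
// the durability contract asserted in this hunk: PersistToDisk(FULL), which
// the destructor also runs, keeps schema, documents, and index intact across
// a restart; PersistToDisk(LITE) keeps the ground-truth document log while
// the index is rebuilt during recovery; skipping persistence risks losing
// unflushed documents. A compact summary of what the assertions check, with
// names invented here for illustration:
enum class Persist { kNone, kLite, kFull };

struct SurvivesRestart {
  bool schema;
  bool documents;
  bool index_intact;  // intact without an index-restoration recovery step
};

SurvivesRestart AfterRestart(Persist persist) {
  switch (persist) {
    case Persist::kFull:
      return {true, true, true};
    case Persist::kLite:
      return {true, true, false};  // document log survives; index rebuilt
    case Persist::kNone:
      return {true, false, false};  // unflushed documents are lost
  }
  return {false, false, false};
}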
- GetSchemaResultProto expected_get_schema_result_proto; - expected_get_schema_result_proto.mutable_status()->set_code(StatusProto::OK); - *expected_get_schema_result_proto.mutable_schema() = CreateMessageSchema(); - - EXPECT_THAT(icing2.GetSchema(), - EqualsProto(expected_get_schema_result_proto)); - - // Documents are still intact. - GetResultProto expected_get_result_proto; - expected_get_result_proto.mutable_status()->set_code(StatusProto::OK); - *expected_get_result_proto.mutable_document() = document; - - EXPECT_THAT( - icing2.Get("namespace", "uri", GetResultSpecProto::default_instance()), - EqualsProto(expected_get_result_proto)); - - // Index is still intact. - SearchSpecProto search_spec; - search_spec.set_term_match_type(TermMatchType::PREFIX); - search_spec.set_query("message"); // Content in the Message document. - - SearchResultProto expected_search_result_proto; - expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); - *expected_search_result_proto.mutable_results()->Add()->mutable_document() = - document; - - SearchResultProto actual_results = - icing2.Search(search_spec, GetDefaultScoringSpec(), - ResultSpecProto::default_instance()); - EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( - expected_search_result_proto)); -} - -TEST_F(IcingSearchEngineTest, NoPersistToDiskLosesAllDocumentsAndIndex) { - IcingSearchEngine icing1(GetDefaultIcingOptions(), GetTestJniCache()); - EXPECT_THAT(icing1.Initialize().status(), ProtoIsOk()); - EXPECT_THAT(icing1.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); - DocumentProto document = CreateMessageDocument("namespace", "uri"); - EXPECT_THAT(icing1.Put(document).status(), ProtoIsOk()); + DocumentProto document1 = CreateMessageDocument("namespace", "uri"); + EXPECT_THAT(icing1.Put(document1).status(), ProtoIsOk()); EXPECT_THAT( icing1.Get("namespace", "uri", GetResultSpecProto::default_instance()) .document(), - EqualsProto(document)); - - // It's intentional that no PersistToDisk call is made before initializing a - // second instance of icing. + EqualsProto(document1)); IcingSearchEngine icing2(GetDefaultIcingOptions(), GetTestJniCache()); - InitializeResultProto init_result = icing2.Initialize(); - EXPECT_THAT(init_result.status(), ProtoIsOk()); - EXPECT_THAT(init_result.initialize_stats().document_store_data_status(), - Eq(InitializeStatsProto::PARTIAL_LOSS)); - EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(), - Eq(InitializeStatsProto::DATA_LOSS)); - EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(), - Eq(InitializeStatsProto::NONE)); - EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(), - Eq(InitializeStatsProto::NONE)); - + EXPECT_THAT(icing2.Initialize().status(), ProtoIsOk()); // The document shouldn't be found because we forgot to call // PersistToDisk(LITE)! EXPECT_THAT( icing2.Get("namespace", "uri", GetResultSpecProto::default_instance()) .status(), ProtoStatusIs(StatusProto::NOT_FOUND)); - - // Searching also shouldn't get us anything because the index wasn't - // recovered. - SearchSpecProto search_spec; - search_spec.set_term_match_type(TermMatchType::PREFIX); - search_spec.set_query("message"); // Content in the Message document. 
- - SearchResultProto expected_search_result_proto; - expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); - - SearchResultProto actual_results = - icing2.Search(search_spec, GetDefaultScoringSpec(), - ResultSpecProto::default_instance()); - EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( - expected_search_result_proto)); } -TEST_F(IcingSearchEngineTest, PersistToDiskLiteSavesGroundTruth) { - DocumentProto document = CreateMessageDocument("namespace", "uri"); - +TEST_F(IcingSearchEngineTest, PersistToDiskLitePersistsPut) { IcingSearchEngine icing1(GetDefaultIcingOptions(), GetTestJniCache()); EXPECT_THAT(icing1.Initialize().status(), ProtoIsOk()); EXPECT_THAT(icing1.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); - EXPECT_THAT(icing1.Put(document).status(), ProtoIsOk()); + DocumentProto document1 = CreateMessageDocument("namespace", "uri"); + EXPECT_THAT(icing1.Put(document1).status(), ProtoIsOk()); EXPECT_THAT(icing1.PersistToDisk(PersistType::LITE).status(), ProtoIsOk()); EXPECT_THAT( icing1.Get("namespace", "uri", GetResultSpecProto::default_instance()) .document(), - EqualsProto(document)); + EqualsProto(document1)); IcingSearchEngine icing2(GetDefaultIcingOptions(), GetTestJniCache()); - InitializeResultProto init_result = icing2.Initialize(); - EXPECT_THAT(init_result.status(), ProtoIsOk()); - EXPECT_THAT(init_result.initialize_stats().document_store_data_status(), - Eq(InitializeStatsProto::NO_DATA_LOSS)); - EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(), - Eq(InitializeStatsProto::NONE)); - - // A checksum mismatch gets reported as an IO error. The document store and - // index didn't have their derived files included in the checksum previously, - // so reinitializing will trigger a checksum mismatch. - EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(), - Eq(InitializeStatsProto::IO_ERROR)); - EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(), - Eq(InitializeStatsProto::IO_ERROR)); - - // Schema is still intact. - GetSchemaResultProto expected_get_schema_result_proto; - expected_get_schema_result_proto.mutable_status()->set_code(StatusProto::OK); - *expected_get_schema_result_proto.mutable_schema() = CreateMessageSchema(); - - EXPECT_THAT(icing2.GetSchema(), - EqualsProto(expected_get_schema_result_proto)); - + EXPECT_THAT(icing2.Initialize().status(), ProtoIsOk()); // The document should be found because we called PersistToDisk(LITE)! EXPECT_THAT( icing2.Get("namespace", "uri", GetResultSpecProto::default_instance()) .document(), - EqualsProto(document)); - - // Recovered index is still intact. - SearchSpecProto search_spec; - search_spec.set_term_match_type(TermMatchType::PREFIX); - search_spec.set_query("message"); // Content in the Message document. 
- - SearchResultProto expected_search_result_proto; - expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); - *expected_search_result_proto.mutable_results()->Add()->mutable_document() = - document; - - SearchResultProto actual_results = - icing2.Search(search_spec, GetDefaultScoringSpec(), - ResultSpecProto::default_instance()); - EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( - expected_search_result_proto)); + EqualsProto(document1)); } TEST_F(IcingSearchEngineTest, ResetOk) { @@ -6123,7 +5591,7 @@ TEST_F(IcingSearchEngineTest, SnippetNormalization) { search_spec.set_query("mdi Zürich"); ResultSpecProto result_spec; - result_spec.mutable_snippet_spec()->set_max_window_utf32_length(64); + result_spec.mutable_snippet_spec()->set_max_window_bytes(64); result_spec.mutable_snippet_spec()->set_num_matches_per_property(2); result_spec.mutable_snippet_spec()->set_num_to_snippet(2); @@ -6186,7 +5654,7 @@ TEST_F(IcingSearchEngineTest, SnippetNormalizationPrefix) { search_spec.set_query("md Zür"); ResultSpecProto result_spec; - result_spec.mutable_snippet_spec()->set_max_window_utf32_length(64); + result_spec.mutable_snippet_spec()->set_max_window_bytes(64); result_spec.mutable_snippet_spec()->set_num_matches_per_property(2); result_spec.mutable_snippet_spec()->set_num_to_snippet(2); @@ -6241,7 +5709,7 @@ TEST_F(IcingSearchEngineTest, SnippetSectionRestrict) { search_spec.set_query("body:Zür"); ResultSpecProto result_spec; - result_spec.mutable_snippet_spec()->set_max_window_utf32_length(64); + result_spec.mutable_snippet_spec()->set_max_window_bytes(64); result_spec.mutable_snippet_spec()->set_num_matches_per_property(10); result_spec.mutable_snippet_spec()->set_num_to_snippet(10); @@ -7514,6 +6982,10 @@ TEST_F(IcingSearchEngineTest, PutDocumentShouldLogIndexingStats) { // No merge should happen. EXPECT_THAT(put_result_proto.put_document_stats().index_merge_latency_ms(), Eq(0)); + // Number of tokens should not exceed. + EXPECT_FALSE(put_result_proto.put_document_stats() + .tokenization_stats() + .exceeded_max_token_num()); // The input document has 2 tokens. EXPECT_THAT(put_result_proto.put_document_stats() .tokenization_stats() @@ -7521,6 +6993,33 @@ TEST_F(IcingSearchEngineTest, PutDocumentShouldLogIndexingStats) { Eq(2)); } +TEST_F(IcingSearchEngineTest, PutDocumentShouldLogWhetherNumTokensExceeds) { + // Create a document with 2 tokens. + DocumentProto document = DocumentBuilder() + .SetKey("icing", "fake_type/0") + .SetSchema("Message") + .AddStringProperty("body", "message body") + .Build(); + + // Create an icing instance with max_tokens_per_doc = 1. + IcingSearchEngineOptions icing_options = GetDefaultIcingOptions(); + icing_options.set_max_tokens_per_doc(1); + IcingSearchEngine icing(icing_options, GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + + PutResultProto put_result_proto = icing.Put(document); + EXPECT_THAT(put_result_proto.status(), ProtoIsOk()); + // Number of tokens(2) exceeds the max allowed value(1). 
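// The assertion pair below encodes the truncation behavior: with
// max_tokens_per_doc = 1 and the two-token body "message body", indexing
// stops at the limit and the overflow is flagged rather than treated as an
// error. A self-contained sketch of that behavior; the helper is an
// illustration, not Icing's actual tokenizer:
#include <cstddef>
#include <string>
#include <vector>

struct TokenizationResult {
  std::vector<std::string> indexed_tokens;
  bool exceeded_max_token_num = false;
};

TokenizationResult IndexWithLimit(const std::vector<std::string>& tokens,
                                  std::size_t max_tokens) {
  TokenizationResult result;
  for (const std::string& token : tokens) {
    if (result.indexed_tokens.size() >= max_tokens) {
      result.exceeded_max_token_num = true;  // flag it, don't error out
      break;
    }
    result.indexed_tokens.push_back(token);
  }
  return result;
}
// With tokens {"message", "body"} and max_tokens = 1, this indexes one token
// and sets exceeded_max_token_num, matching the two expectations below.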
+ EXPECT_TRUE(put_result_proto.put_document_stats() + .tokenization_stats() + .exceeded_max_token_num()); + EXPECT_THAT(put_result_proto.put_document_stats() + .tokenization_stats() + .num_tokens_indexed(), + Eq(1)); +} + TEST_F(IcingSearchEngineTest, PutDocumentShouldLogIndexMergeLatency) { DocumentProto document1 = DocumentBuilder() .SetKey("icing", "fake_type/1") @@ -7769,7 +7268,7 @@ TEST_F(IcingSearchEngineTest, QueryStatsProtoTest) { ResultSpecProto result_spec; result_spec.set_num_per_page(2); - result_spec.mutable_snippet_spec()->set_max_window_utf32_length(64); + result_spec.mutable_snippet_spec()->set_max_window_bytes(64); result_spec.mutable_snippet_spec()->set_num_matches_per_property(1); result_spec.mutable_snippet_spec()->set_num_to_snippet(3); @@ -7980,7 +7479,7 @@ TEST_F(IcingSearchEngineTest, SnippetErrorTest) { ResultSpecProto result_spec; result_spec.mutable_snippet_spec()->set_num_to_snippet(2); result_spec.mutable_snippet_spec()->set_num_matches_per_property(3); - result_spec.mutable_snippet_spec()->set_max_window_utf32_length(4); + result_spec.mutable_snippet_spec()->set_max_window_bytes(4); SearchResultProto search_results = icing.Search(search_spec, scoring_spec, result_spec); @@ -8088,599 +7587,6 @@ TEST_F(IcingSearchEngineTest, CJKSnippetTest) { EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(2)); } -TEST_F(IcingSearchEngineTest, InvalidToEmptyQueryTest) { - IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); - ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); - ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); - - // String: "Luca Brasi sleeps with the 🐟🐟🐟." - // ^ ^ ^ ^ ^ ^ ^ ^ ^ - // UTF8 idx: 0 5 11 18 23 27 3135 39 - // UTF16 idx: 0 5 11 18 23 27 2931 33 - // Breaks into segments: "Luca", "Brasi", "sleeps", "with", "the", "🐟", "🐟" - // and "🐟". - constexpr std::string_view kSicilianMessage = - "Luca Brasi sleeps with the 🐟🐟🐟."; - DocumentProto document = DocumentBuilder() - .SetKey("namespace", "uri1") - .SetSchema("Message") - .AddStringProperty("body", kSicilianMessage) - .Build(); - ASSERT_THAT(icing.Put(document).status(), ProtoIsOk()); - DocumentProto document_two = - DocumentBuilder() - .SetKey("namespace", "uri2") - .SetSchema("Message") - .AddStringProperty("body", "Some other content.") - .Build(); - ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk()); - - // Search and request snippet matching but no windowing. 
- SearchSpecProto search_spec; - search_spec.set_query("?"); - search_spec.set_term_match_type(MATCH_PREFIX); - ScoringSpecProto scoring_spec; - ResultSpecProto result_spec; - - // Search and make sure that we got a single successful result - SearchResultProto search_results = - icing.Search(search_spec, scoring_spec, result_spec); - EXPECT_THAT(search_results.status(), ProtoIsOk()); - EXPECT_THAT(search_results.results(), SizeIs(2)); - - search_spec.set_query("。"); - search_results = icing.Search(search_spec, scoring_spec, result_spec); - EXPECT_THAT(search_results.status(), ProtoIsOk()); - EXPECT_THAT(search_results.results(), SizeIs(2)); - - search_spec.set_query("-"); - search_results = icing.Search(search_spec, scoring_spec, result_spec); - EXPECT_THAT(search_results.status(), ProtoIsOk()); - EXPECT_THAT(search_results.results(), SizeIs(2)); - - search_spec.set_query(":"); - search_results = icing.Search(search_spec, scoring_spec, result_spec); - EXPECT_THAT(search_results.status(), ProtoIsOk()); - EXPECT_THAT(search_results.results(), SizeIs(2)); - - search_spec.set_query("OR"); - search_results = icing.Search(search_spec, scoring_spec, result_spec); - EXPECT_THAT(search_results.status(), ProtoIsOk()); - EXPECT_THAT(search_results.results(), SizeIs(2)); - - search_spec.set_query(" "); - search_results = icing.Search(search_spec, scoring_spec, result_spec); - EXPECT_THAT(search_results.status(), ProtoIsOk()); - EXPECT_THAT(search_results.results(), SizeIs(2)); -} - -TEST_F(IcingSearchEngineTest, EmojiSnippetTest) { - IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); - ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); - ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); - - // String: "Luca Brasi sleeps with the 🐟🐟🐟." - // ^ ^ ^ ^ ^ ^ ^ ^ ^ - // UTF8 idx: 0 5 11 18 23 27 3135 39 - // UTF16 idx: 0 5 11 18 23 27 2931 33 - // Breaks into segments: "Luca", "Brasi", "sleeps", "with", "the", "🐟", "🐟" - // and "🐟". - constexpr std::string_view kSicilianMessage = - "Luca Brasi sleeps with the 🐟🐟🐟."; - DocumentProto document = DocumentBuilder() - .SetKey("namespace", "uri1") - .SetSchema("Message") - .AddStringProperty("body", kSicilianMessage) - .Build(); - ASSERT_THAT(icing.Put(document).status(), ProtoIsOk()); - DocumentProto document_two = - DocumentBuilder() - .SetKey("namespace", "uri2") - .SetSchema("Message") - .AddStringProperty("body", "Some other content.") - .Build(); - ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk()); - - // Search and request snippet matching but no windowing. 
- SearchSpecProto search_spec; - search_spec.set_query("🐟"); - search_spec.set_term_match_type(MATCH_PREFIX); - - ResultSpecProto result_spec; - result_spec.mutable_snippet_spec()->set_num_to_snippet(1); - result_spec.mutable_snippet_spec()->set_num_matches_per_property(1); - - // Search and make sure that we got a single successful result - SearchResultProto search_results = icing.Search( - search_spec, ScoringSpecProto::default_instance(), result_spec); - ASSERT_THAT(search_results.status(), ProtoIsOk()); - ASSERT_THAT(search_results.results(), SizeIs(1)); - const SearchResultProto::ResultProto* result = &search_results.results(0); - EXPECT_THAT(result->document().uri(), Eq("uri1")); - - // Ensure that one and only one property was matched and it was "body" - ASSERT_THAT(result->snippet().entries(), SizeIs(1)); - const SnippetProto::EntryProto* entry = &result->snippet().entries(0); - EXPECT_THAT(entry->property_name(), Eq("body")); - - // Get the content for "subject" and see what the match is. - std::string_view content = GetString(&result->document(), "body"); - ASSERT_THAT(content, Eq(kSicilianMessage)); - - // Ensure that there is one and only one match within "subject" - ASSERT_THAT(entry->snippet_matches(), SizeIs(1)); - const SnippetMatchProto& match_proto = entry->snippet_matches(0); - - EXPECT_THAT(match_proto.exact_match_byte_position(), Eq(27)); - EXPECT_THAT(match_proto.exact_match_byte_length(), Eq(4)); - std::string_view match = - content.substr(match_proto.exact_match_byte_position(), - match_proto.exact_match_byte_length()); - ASSERT_THAT(match, Eq("🐟")); - - // Ensure that the utf-16 values are also as expected - EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(27)); - EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(2)); -} - -TEST_F(IcingSearchEngineTest, PutDocumentIndexFailureDeletion) { - IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); - ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); - ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); - - // Testing has shown that adding ~600,000 terms generated this way will - // fill up the hit buffer. - std::vector<std::string> terms = GenerateUniqueTerms(600000); - std::string content = absl_ports::StrJoin(terms, " "); - DocumentProto document = DocumentBuilder() - .SetKey("namespace", "uri1") - .SetSchema("Message") - .AddStringProperty("body", "foo " + content) - .Build(); - // We failed to add the document to the index fully. This means that we should - // reject the document from Icing entirely. - ASSERT_THAT(icing.Put(document).status(), - ProtoStatusIs(StatusProto::OUT_OF_SPACE)); - - // Make sure that the document isn't searchable. - SearchSpecProto search_spec; - search_spec.set_query("foo"); - search_spec.set_term_match_type(MATCH_PREFIX); - - SearchResultProto search_results = - icing.Search(search_spec, ScoringSpecProto::default_instance(), - ResultSpecProto::default_instance()); - ASSERT_THAT(search_results.status(), ProtoIsOk()); - ASSERT_THAT(search_results.results(), IsEmpty()); - - // Make sure that the document isn't retrievable. 
- GetResultProto get_result = - icing.Get("namespace", "uri1", GetResultSpecProto::default_instance()); - ASSERT_THAT(get_result.status(), ProtoStatusIs(StatusProto::NOT_FOUND)); -} - -TEST_F(IcingSearchEngineTest, SearchSuggestionsTest) { - IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); - ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); - ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(), - ProtoIsOk()); - - // Creates and inserts 6 documents, and index 6 termSix, 5 termFive, 4 - // termFour, 3 termThree, 2 termTwo and one termOne. - DocumentProto document1 = - DocumentBuilder() - .SetKey("namespace", "uri1") - .SetSchema("Email") - .SetCreationTimestampMs(10) - .AddStringProperty( - "subject", "termOne termTwo termThree termFour termFive termSix") - .Build(); - DocumentProto document2 = - DocumentBuilder() - .SetKey("namespace", "uri2") - .SetSchema("Email") - .SetCreationTimestampMs(10) - .AddStringProperty("subject", - "termTwo termThree termFour termFive termSix") - .Build(); - DocumentProto document3 = - DocumentBuilder() - .SetKey("namespace", "uri3") - .SetSchema("Email") - .SetCreationTimestampMs(10) - .AddStringProperty("subject", "termThree termFour termFive termSix") - .Build(); - DocumentProto document4 = - DocumentBuilder() - .SetKey("namespace", "uri4") - .SetSchema("Email") - .SetCreationTimestampMs(10) - .AddStringProperty("subject", "termFour termFive termSix") - .Build(); - DocumentProto document5 = - DocumentBuilder() - .SetKey("namespace", "uri5") - .SetSchema("Email") - .SetCreationTimestampMs(10) - .AddStringProperty("subject", "termFive termSix") - .Build(); - DocumentProto document6 = DocumentBuilder() - .SetKey("namespace", "uri6") - .SetSchema("Email") - .SetCreationTimestampMs(10) - .AddStringProperty("subject", "termSix") - .Build(); - ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); - ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); - ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk()); - ASSERT_THAT(icing.Put(document4).status(), ProtoIsOk()); - ASSERT_THAT(icing.Put(document5).status(), ProtoIsOk()); - ASSERT_THAT(icing.Put(document6).status(), ProtoIsOk()); - - SuggestionSpecProto suggestion_spec; - suggestion_spec.set_prefix("t"); - suggestion_spec.set_num_to_return(10); - suggestion_spec.mutable_scoring_spec()->set_scoring_match_type( - TermMatchType::PREFIX); - - // Query all suggestions, and they will be ranked. - SuggestionResponse response = icing.SearchSuggestions(suggestion_spec); - ASSERT_THAT(response.status(), ProtoIsOk()); - ASSERT_THAT(response.suggestions().at(0).query(), "termsix"); - ASSERT_THAT(response.suggestions().at(1).query(), "termfive"); - ASSERT_THAT(response.suggestions().at(2).query(), "termfour"); - ASSERT_THAT(response.suggestions().at(3).query(), "termthree"); - ASSERT_THAT(response.suggestions().at(4).query(), "termtwo"); - ASSERT_THAT(response.suggestions().at(5).query(), "termone"); - - // Query first three suggestions, and they will be ranked. 
- suggestion_spec.set_num_to_return(3); - response = icing.SearchSuggestions(suggestion_spec); - ASSERT_THAT(response.status(), ProtoIsOk()); - ASSERT_THAT(response.suggestions().at(0).query(), "termsix"); - ASSERT_THAT(response.suggestions().at(1).query(), "termfive"); - ASSERT_THAT(response.suggestions().at(2).query(), "termfour"); -} - -TEST_F(IcingSearchEngineTest, - SearchSuggestionsTest_ShouldReturnInOneNamespace) { - IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); - ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); - ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(), - ProtoIsOk()); - - DocumentProto document1 = DocumentBuilder() - .SetKey("namespace1", "uri1") - .SetSchema("Email") - .SetCreationTimestampMs(10) - .AddStringProperty("subject", "foo fool") - .Build(); - DocumentProto document2 = DocumentBuilder() - .SetKey("namespace2", "uri2") - .SetSchema("Email") - .SetCreationTimestampMs(10) - .AddStringProperty("subject", "fool") - .Build(); - ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); - ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); - - SuggestionResponse::Suggestion suggestionFoo; - suggestionFoo.set_query("foo"); - SuggestionResponse::Suggestion suggestionFool; - suggestionFool.set_query("fool"); - - // namespace1 has 2 results. - SuggestionSpecProto suggestion_spec; - suggestion_spec.set_prefix("f"); - suggestion_spec.add_namespace_filters("namespace1"); - suggestion_spec.set_num_to_return(10); - suggestion_spec.mutable_scoring_spec()->set_scoring_match_type( - TermMatchType::PREFIX); - - SuggestionResponse response = icing.SearchSuggestions(suggestion_spec); - ASSERT_THAT(response.status(), ProtoIsOk()); - ASSERT_THAT(response.suggestions(), - UnorderedElementsAre(EqualsProto(suggestionFoo), - EqualsProto(suggestionFool))); -} - -TEST_F(IcingSearchEngineTest, - SearchSuggestionsTest_ShouldReturnInMultipleNamespace) { - IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); - ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); - ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(), - ProtoIsOk()); - - DocumentProto document1 = DocumentBuilder() - .SetKey("namespace1", "uri1") - .SetSchema("Email") - .SetCreationTimestampMs(10) - .AddStringProperty("subject", "fo") - .Build(); - DocumentProto document2 = DocumentBuilder() - .SetKey("namespace2", "uri2") - .SetSchema("Email") - .SetCreationTimestampMs(10) - .AddStringProperty("subject", "foo") - .Build(); - DocumentProto document3 = DocumentBuilder() - .SetKey("namespace3", "uri3") - .SetSchema("Email") - .SetCreationTimestampMs(10) - .AddStringProperty("subject", "fool") - .Build(); - ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); - ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); - ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk()); - - SuggestionResponse::Suggestion suggestionFoo; - suggestionFoo.set_query("foo"); - SuggestionResponse::Suggestion suggestionFool; - suggestionFool.set_query("fool"); - - // namespace2 and namespace3 has 2 results. 
- SuggestionSpecProto suggestion_spec; - suggestion_spec.set_prefix("f"); - suggestion_spec.add_namespace_filters("namespace2"); - suggestion_spec.add_namespace_filters("namespace3"); - suggestion_spec.set_num_to_return(10); - suggestion_spec.mutable_scoring_spec()->set_scoring_match_type( - TermMatchType::PREFIX); - - SuggestionResponse response = icing.SearchSuggestions(suggestion_spec); - ASSERT_THAT(response.status(), ProtoIsOk()); - ASSERT_THAT(response.suggestions(), - UnorderedElementsAre(EqualsProto(suggestionFoo), - EqualsProto(suggestionFool))); -} - -TEST_F(IcingSearchEngineTest, - SearchSuggestionsTest_OtherNamespaceDontContributeToHitCount) { - IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); - ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); - ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(), - ProtoIsOk()); - - // Index 4 documents, - // namespace1 has 2 hit2 for term one - // namespace2 has 2 hit2 for term two and 1 hit for term one. - DocumentProto document1 = DocumentBuilder() - .SetKey("namespace1", "uri1") - .SetSchema("Email") - .SetCreationTimestampMs(10) - .AddStringProperty("subject", "termone") - .Build(); - DocumentProto document2 = DocumentBuilder() - .SetKey("namespace1", "uri2") - .SetSchema("Email") - .SetCreationTimestampMs(10) - .AddStringProperty("subject", "termone") - .Build(); - DocumentProto document3 = DocumentBuilder() - .SetKey("namespace2", "uri2") - .SetSchema("Email") - .SetCreationTimestampMs(10) - .AddStringProperty("subject", "termone termtwo") - .Build(); - DocumentProto document4 = DocumentBuilder() - .SetKey("namespace2", "uri3") - .SetSchema("Email") - .SetCreationTimestampMs(10) - .AddStringProperty("subject", "termtwo") - .Build(); - ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); - ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); - ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk()); - ASSERT_THAT(icing.Put(document4).status(), ProtoIsOk()); - - SuggestionResponse::Suggestion suggestionTermOne; - suggestionTermOne.set_query("termone"); - SuggestionResponse::Suggestion suggestionTermTwo; - suggestionTermTwo.set_query("termtwo"); - - // only search suggestion for namespace2. The correctly order should be - // {"termtwo", "termone"}. If we're not filtering out namespace1 when - // calculating our score, then it will be {"termone", "termtwo"}. 
- SuggestionSpecProto suggestion_spec; - suggestion_spec.set_prefix("t"); - suggestion_spec.add_namespace_filters("namespace2"); - suggestion_spec.set_num_to_return(10); - suggestion_spec.mutable_scoring_spec()->set_scoring_match_type( - TermMatchType::PREFIX); - - SuggestionResponse response = icing.SearchSuggestions(suggestion_spec); - ASSERT_THAT(response.status(), ProtoIsOk()); - ASSERT_THAT(response.suggestions(), - ElementsAre(EqualsProto(suggestionTermTwo), - EqualsProto(suggestionTermOne))); -} - -TEST_F(IcingSearchEngineTest, SearchSuggestionsTest_DeletionTest) { - IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); - ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); - ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(), - ProtoIsOk()); - - DocumentProto document1 = DocumentBuilder() - .SetKey("namespace1", "uri1") - .SetSchema("Email") - .SetCreationTimestampMs(10) - .AddStringProperty("subject", "fool") - .Build(); - DocumentProto document2 = DocumentBuilder() - .SetKey("namespace2", "uri2") - .SetSchema("Email") - .SetCreationTimestampMs(10) - .AddStringProperty("subject", "fool") - .Build(); - ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); - ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); - - SuggestionResponse::Suggestion suggestionFool; - suggestionFool.set_query("fool"); - - // namespace1 has this suggestion - SuggestionSpecProto suggestion_spec; - suggestion_spec.set_prefix("f"); - suggestion_spec.add_namespace_filters("namespace1"); - suggestion_spec.set_num_to_return(10); - suggestion_spec.mutable_scoring_spec()->set_scoring_match_type( - TermMatchType::PREFIX); - - SuggestionResponse response = icing.SearchSuggestions(suggestion_spec); - ASSERT_THAT(response.status(), ProtoIsOk()); - ASSERT_THAT(response.suggestions(), - UnorderedElementsAre(EqualsProto(suggestionFool))); - - // namespace2 has this suggestion - suggestion_spec.clear_namespace_filters(); - suggestion_spec.add_namespace_filters("namespace2"); - response = icing.SearchSuggestions(suggestion_spec); - ASSERT_THAT(response.status(), ProtoIsOk()); - ASSERT_THAT(response.suggestions(), - UnorderedElementsAre(EqualsProto(suggestionFool))); - - // delete document from namespace 1 - EXPECT_THAT(icing.Delete("namespace1", "uri1").status(), ProtoIsOk()); - - // Now namespace1 will return empty - suggestion_spec.clear_namespace_filters(); - suggestion_spec.add_namespace_filters("namespace1"); - response = icing.SearchSuggestions(suggestion_spec); - ASSERT_THAT(response.status(), ProtoIsOk()); - ASSERT_THAT(response.suggestions(), IsEmpty()); - - // namespace2 still has this suggestion, so we can prove the reason of - // namespace 1 cannot find it is we filter it out, not it doesn't exist. 
- suggestion_spec.add_namespace_filters("namespace2"); - response = icing.SearchSuggestions(suggestion_spec); - ASSERT_THAT(response.status(), ProtoIsOk()); - ASSERT_THAT(response.suggestions(), - UnorderedElementsAre(EqualsProto(suggestionFool))); -} - -TEST_F(IcingSearchEngineTest, SearchSuggestionsTest_ExpiredTest) { - DocumentProto document1 = DocumentBuilder() - .SetKey("namespace1", "uri1") - .SetSchema("Email") - .SetCreationTimestampMs(100) - .SetTtlMs(500) - .AddStringProperty("subject", "fool") - .Build(); - DocumentProto document2 = DocumentBuilder() - .SetKey("namespace2", "uri2") - .SetSchema("Email") - .SetCreationTimestampMs(100) - .SetTtlMs(1000) - .AddStringProperty("subject", "fool") - .Build(); - { - auto fake_clock = std::make_unique<FakeClock>(); - fake_clock->SetSystemTimeMilliseconds(400); - - TestIcingSearchEngine icing(GetDefaultIcingOptions(), - std::make_unique<Filesystem>(), - std::make_unique<IcingFilesystem>(), - std::move(fake_clock), GetTestJniCache()); - EXPECT_THAT(icing.Initialize().status(), ProtoIsOk()); - ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(), - ProtoIsOk()); - - ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); - ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); - - SuggestionResponse::Suggestion suggestionFool; - suggestionFool.set_query("fool"); - - // namespace1 has this suggestion - SuggestionSpecProto suggestion_spec; - suggestion_spec.set_prefix("f"); - suggestion_spec.add_namespace_filters("namespace1"); - suggestion_spec.set_num_to_return(10); - suggestion_spec.mutable_scoring_spec()->set_scoring_match_type( - TermMatchType::PREFIX); - - SuggestionResponse response = icing.SearchSuggestions(suggestion_spec); - ASSERT_THAT(response.status(), ProtoIsOk()); - ASSERT_THAT(response.suggestions(), - UnorderedElementsAre(EqualsProto(suggestionFool))); - - // namespace2 has this suggestion - suggestion_spec.clear_namespace_filters(); - suggestion_spec.add_namespace_filters("namespace2"); - response = icing.SearchSuggestions(suggestion_spec); - ASSERT_THAT(response.status(), ProtoIsOk()); - ASSERT_THAT(response.suggestions(), - UnorderedElementsAre(EqualsProto(suggestionFool))); - } - // We reinitialize here so we can feed in a fake clock this time - { - // Time needs to be past document1 creation time (100) + ttl (500) for it - // to count as "expired". document2 is not expired since its ttl is 1000. 
- auto fake_clock = std::make_unique<FakeClock>(); - fake_clock->SetSystemTimeMilliseconds(800); - - TestIcingSearchEngine icing(GetDefaultIcingOptions(), - std::make_unique<Filesystem>(), - std::make_unique<IcingFilesystem>(), - std::move(fake_clock), GetTestJniCache()); - ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); - - SuggestionSpecProto suggestion_spec; - suggestion_spec.set_prefix("f"); - suggestion_spec.add_namespace_filters("namespace1"); - suggestion_spec.set_num_to_return(10); - suggestion_spec.mutable_scoring_spec()->set_scoring_match_type( - TermMatchType::PREFIX); - - // Now namespace1 will return empty - suggestion_spec.clear_namespace_filters(); - suggestion_spec.add_namespace_filters("namespace1"); - SuggestionResponse response = icing.SearchSuggestions(suggestion_spec); - ASSERT_THAT(response.status(), ProtoIsOk()); - ASSERT_THAT(response.suggestions(), IsEmpty()); - - // namespace2 still has this suggestion - SuggestionResponse::Suggestion suggestionFool; - suggestionFool.set_query("fool"); - - suggestion_spec.add_namespace_filters("namespace2"); - response = icing.SearchSuggestions(suggestion_spec); - ASSERT_THAT(response.status(), ProtoIsOk()); - ASSERT_THAT(response.suggestions(), - UnorderedElementsAre(EqualsProto(suggestionFool))); - } -} - -TEST_F(IcingSearchEngineTest, SearchSuggestionsTest_emptyPrefix) { - IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); - ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); - - SuggestionSpecProto suggestion_spec; - suggestion_spec.set_prefix(""); - suggestion_spec.set_num_to_return(10); - suggestion_spec.mutable_scoring_spec()->set_scoring_match_type( - TermMatchType::PREFIX); - - ASSERT_THAT(icing.SearchSuggestions(suggestion_spec).status(), - ProtoStatusIs(StatusProto::INVALID_ARGUMENT)); -} - -TEST_F(IcingSearchEngineTest, SearchSuggestionsTest_NonPositiveNumToReturn) { - IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); - ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); - - SuggestionSpecProto suggestion_spec; - suggestion_spec.set_prefix("prefix"); - suggestion_spec.set_num_to_return(0); - suggestion_spec.mutable_scoring_spec()->set_scoring_match_type( - TermMatchType::PREFIX); - - ASSERT_THAT(icing.SearchSuggestions(suggestion_spec).status(), - ProtoStatusIs(StatusProto::INVALID_ARGUMENT)); -} - -#ifndef ICING_JNI_TEST // We skip this test case when we're running in a jni_test since the data files // will be stored in the android-instrumented storage location, rather than the // normal cc_library runfiles directory. To get that storage location, it's @@ -8690,6 +7596,12 @@ TEST_F(IcingSearchEngineTest, SearchSuggestionsTest_NonPositiveNumToReturn) { // this native side yet, we're just going to disable this. The functionality is // already well-tested across 4 different emulated OS's so we're not losing much // test coverage here. +#ifndef ICING_JNI_TEST +// Disable backwards compat test. This test is enabled in google3, but disabled +// in jetpack/framework because we didn't want to keep the binary testdata files +// in our repo. 
+#define DISABLE_BACKWARDS_COMPAT_TEST +#ifndef DISABLE_BACKWARDS_COMPAT_TEST TEST_F(IcingSearchEngineTest, MigrateToPortableFileBackedProtoLog) { // Copy the testdata files into our IcingSearchEngine directory std::string dir_without_portable_log; @@ -8729,7 +7641,7 @@ TEST_F(IcingSearchEngineTest, MigrateToPortableFileBackedProtoLog) { EXPECT_THAT(init_result.initialize_stats().document_store_data_status(), Eq(InitializeStatsProto::NO_DATA_LOSS)); EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(), - Eq(InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT)); + Eq(InitializeStatsProto::NONE)); EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(), Eq(InitializeStatsProto::NONE)); EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(), @@ -8843,6 +7755,7 @@ TEST_F(IcingSearchEngineTest, MigrateToPortableFileBackedProtoLog) { EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(expected_document3)); } +#endif // DISABLE_BACKWARDS_COMPAT_TEST #endif // !ICING_JNI_TEST } // namespace diff --git a/icing/index/index-processor.cc b/icing/index/index-processor.cc index 207c033..6d8632f 100644 --- a/icing/index/index-processor.cc +++ b/icing/index/index-processor.cc @@ -43,13 +43,14 @@ namespace lib { libtextclassifier3::StatusOr<std::unique_ptr<IndexProcessor>> IndexProcessor::Create(const Normalizer* normalizer, Index* index, + const IndexProcessor::Options& options, const Clock* clock) { ICING_RETURN_ERROR_IF_NULL(normalizer); ICING_RETURN_ERROR_IF_NULL(index); ICING_RETURN_ERROR_IF_NULL(clock); return std::unique_ptr<IndexProcessor>( - new IndexProcessor(normalizer, index, clock)); + new IndexProcessor(normalizer, index, options, clock)); } libtextclassifier3::Status IndexProcessor::IndexDocument( @@ -65,48 +66,53 @@ libtextclassifier3::Status IndexProcessor::IndexDocument( } index_->set_last_added_document_id(document_id); uint32_t num_tokens = 0; - libtextclassifier3::Status status; + libtextclassifier3::Status overall_status; for (const TokenizedSection& section : tokenized_document.sections()) { // TODO(b/152934343): pass real namespace ids in Index::Editor editor = index_->Edit(document_id, section.metadata.id, section.metadata.term_match_type, /*namespace_id=*/0); for (std::string_view token : section.token_sequence) { - ++num_tokens; - - switch (section.metadata.tokenizer) { - case StringIndexingConfig::TokenizerType::VERBATIM: - // data() is safe to use here because a token created from the - // VERBATIM tokenizer is the entire string value. The character at - // data() + token.length() is guaranteed to be a null char. - status = editor.BufferTerm(token.data()); - break; - case StringIndexingConfig::TokenizerType::NONE: - ICING_LOG(WARNING) - << "Unexpected TokenizerType::NONE found when indexing document."; - [[fallthrough]]; - case StringIndexingConfig::TokenizerType::PLAIN: - std::string normalized_term = normalizer_.NormalizeTerm(token); - status = editor.BufferTerm(normalized_term.c_str()); + if (++num_tokens > options_.max_tokens_per_document) { + // Index all tokens buffered so far. 
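+          // Everything buffered so far is committed below; tokens past the
+          // limit are never indexed, regardless of which TokenLimitBehavior
+          // is configured.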
+          editor.IndexAllBufferedTerms();
+          if (put_document_stats != nullptr) {
+            put_document_stats->mutable_tokenization_stats()
+                ->set_exceeded_max_token_num(true);
+            put_document_stats->mutable_tokenization_stats()
+                ->set_num_tokens_indexed(options_.max_tokens_per_document);
+          }
+          switch (options_.token_limit_behavior) {
+            case Options::TokenLimitBehavior::kReturnError:
+              return absl_ports::ResourceExhaustedError(
+                  "Max number of tokens reached!");
+            case Options::TokenLimitBehavior::kSuppressError:
+              return overall_status;
+          }
       }
-
-      if (!status.ok()) {
-        // We've encountered a failure. Bail out. We'll mark this doc as deleted
-        // and signal a failure to the client.
-        ICING_LOG(WARNING) << "Failed to buffer term in lite lexicon due to: "
-                           << status.error_message();
-        break;
+      std::string term = normalizer_.NormalizeTerm(token);
+      // Add this term to the hit buffer. Even if adding this hit fails, we
+      // keep trying to add more hits because it's possible that future hits
+      // could still be added successfully. For instance, if the lexicon is
+      // full, we might fail to add a hit for a new term, but should still be
+      // able to add hits for terms that are already in the index.
+      auto status = editor.BufferTerm(term.c_str());
+      if (overall_status.ok() && !status.ok()) {
+        // If we've succeeded in adding everything so far, set overall_status
+        // to represent this new failure. If we've already failed, no need to
+        // update the status - we're already going to return a resource
+        // exhausted error.
+        overall_status = status;
+      }
     }
-    if (!status.ok()) {
-      break;
-    }
     // Add all the seen terms to the index with their term frequency.
-    status = editor.IndexAllBufferedTerms();
-    if (!status.ok()) {
-      ICING_LOG(WARNING) << "Failed to add hits in lite index due to: "
-                         << status.error_message();
-      break;
+    auto status = editor.IndexAllBufferedTerms();
+    if (overall_status.ok() && !status.ok()) {
+      // If we've succeeded so far, set overall_status to represent this new
+      // failure. If we've already failed, no need to update the status -
+      // we're already going to return a resource exhausted error.
+      overall_status = status;
     }
   }
@@ -117,11 +123,9 @@ libtextclassifier3::Status IndexProcessor::IndexDocument(
       num_tokens);
   }
-  // If we're either successful or we've hit resource exhausted, then attempt a
-  // merge.
-  if ((status.ok() || absl_ports::IsResourceExhausted(status)) &&
-      index_->WantsMerge()) {
-    ICING_LOG(ERROR) << "Merging the index at docid " << document_id << ".";
+  // Merge if necessary.
+  if (overall_status.ok() && index_->WantsMerge()) {
+    ICING_VLOG(1) << "Merging the index at docid " << document_id << ".";
     std::unique_ptr<Timer> merge_timer = clock_.GetNewTimer();
     libtextclassifier3::Status merge_status = index_->Merge();
@@ -146,7 +150,7 @@ libtextclassifier3::Status IndexProcessor::IndexDocument(
     }
   }
-  return status;
+  return overall_status;
 }
 }  // namespace lib
diff --git a/icing/index/index-processor.h b/icing/index/index-processor.h
index 269e41c..6b07c98 100644
--- a/icing/index/index-processor.h
+++ b/icing/index/index-processor.h
@@ -32,6 +32,23 @@ namespace lib {
 class IndexProcessor {
  public:
+  struct Options {
+    int32_t max_tokens_per_document;
+
+    // Indicates how a document exceeding max_tokens_per_document should be
+    // handled.
+    enum class TokenLimitBehavior {
+      // When set, the first max_tokens_per_document tokens will be indexed.
+      // If the token count exceeds max_tokens_per_document, a
+      // ResourceExhausted error will be returned.
+ kReturnError, + // When set, the first max_tokens_per_document will be indexed. If the + // token count exceeds max_tokens_per_document, OK will be returned. + kSuppressError, + }; + TokenLimitBehavior token_limit_behavior; + }; + // Factory function to create an IndexProcessor which does not take ownership // of any input components, and all pointers must refer to valid objects that // outlive the created IndexProcessor instance. @@ -40,7 +57,8 @@ class IndexProcessor { // An IndexProcessor on success // FAILED_PRECONDITION if any of the pointers is null. static libtextclassifier3::StatusOr<std::unique_ptr<IndexProcessor>> Create( - const Normalizer* normalizer, Index* index, const Clock* clock); + const Normalizer* normalizer, Index* index, const Options& options, + const Clock* clock); // Add tokenized document to the index, associated with document_id. If the // number of tokens in the document exceeds max_tokens_per_document, then only @@ -66,11 +84,18 @@ class IndexProcessor { PutDocumentStatsProto* put_document_stats = nullptr); private: - IndexProcessor(const Normalizer* normalizer, Index* index, const Clock* clock) - : normalizer_(*normalizer), index_(index), clock_(*clock) {} + IndexProcessor(const Normalizer* normalizer, Index* index, + const Options& options, const Clock* clock) + : normalizer_(*normalizer), + index_(index), + options_(options), + clock_(*clock) {} + + std::string NormalizeToken(const Token& token); const Normalizer& normalizer_; Index* const index_; + const Options options_; const Clock& clock_; }; diff --git a/icing/index/index-processor_benchmark.cc b/icing/index/index-processor_benchmark.cc index 1aad7d0..afeac4d 100644 --- a/icing/index/index-processor_benchmark.cc +++ b/icing/index/index-processor_benchmark.cc @@ -16,6 +16,7 @@ #include "gmock/gmock.h" #include "icing/document-builder.h" #include "icing/file/filesystem.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/index/index-processor.h" #include "icing/index/index.h" #include "icing/legacy/core/icing-string-util.h" @@ -23,7 +24,6 @@ #include "icing/schema/schema-util.h" #include "icing/schema/section-manager.h" #include "icing/testing/common-matchers.h" -#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" #include "icing/tokenization/language-segmenter-factory.h" @@ -168,6 +168,17 @@ void CleanUp(const Filesystem& filesystem, const std::string& index_dir) { filesystem.DeleteDirectoryRecursively(index_dir.c_str()); } +std::unique_ptr<IndexProcessor> CreateIndexProcessor( + const Normalizer* normalizer, Index* index, const Clock* clock) { + IndexProcessor::Options processor_options{}; + processor_options.max_tokens_per_document = 1024 * 1024 * 10; + processor_options.token_limit_behavior = + IndexProcessor::Options::TokenLimitBehavior::kReturnError; + + return IndexProcessor::Create(normalizer, index, processor_options, clock) + .ValueOrDie(); +} + void BM_IndexDocumentWithOneProperty(benchmark::State& state) { bool run_via_adb = absl::GetFlag(FLAGS_adb); if (!run_via_adb) { @@ -189,9 +200,9 @@ void BM_IndexDocumentWithOneProperty(benchmark::State& state) { std::unique_ptr<Normalizer> normalizer = CreateNormalizer(); Clock clock; std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(&clock); - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<IndexProcessor> index_processor, - IndexProcessor::Create(normalizer.get(), index.get(), &clock)); + std::unique_ptr<IndexProcessor> index_processor = + 
CreateIndexProcessor(normalizer.get(), index.get(), &clock); + DocumentProto input_document = CreateDocumentWithOneProperty(state.range(0)); TokenizedDocument tokenized_document(std::move( TokenizedDocument::Create(schema_store.get(), language_segmenter.get(), @@ -243,9 +254,8 @@ void BM_IndexDocumentWithTenProperties(benchmark::State& state) { std::unique_ptr<Normalizer> normalizer = CreateNormalizer(); Clock clock; std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(&clock); - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<IndexProcessor> index_processor, - IndexProcessor::Create(normalizer.get(), index.get(), &clock)); + std::unique_ptr<IndexProcessor> index_processor = + CreateIndexProcessor(normalizer.get(), index.get(), &clock); DocumentProto input_document = CreateDocumentWithTenProperties(state.range(0)); @@ -299,9 +309,8 @@ void BM_IndexDocumentWithDiacriticLetters(benchmark::State& state) { std::unique_ptr<Normalizer> normalizer = CreateNormalizer(); Clock clock; std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(&clock); - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<IndexProcessor> index_processor, - IndexProcessor::Create(normalizer.get(), index.get(), &clock)); + std::unique_ptr<IndexProcessor> index_processor = + CreateIndexProcessor(normalizer.get(), index.get(), &clock); DocumentProto input_document = CreateDocumentWithDiacriticLetters(state.range(0)); @@ -355,9 +364,8 @@ void BM_IndexDocumentWithHiragana(benchmark::State& state) { std::unique_ptr<Normalizer> normalizer = CreateNormalizer(); Clock clock; std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(&clock); - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<IndexProcessor> index_processor, - IndexProcessor::Create(normalizer.get(), index.get(), &clock)); + std::unique_ptr<IndexProcessor> index_processor = + CreateIndexProcessor(normalizer.get(), index.get(), &clock); DocumentProto input_document = CreateDocumentWithHiragana(state.range(0)); TokenizedDocument tokenized_document(std::move( diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc index 7746688..8a6a9f5 100644 --- a/icing/index/index-processor_test.cc +++ b/icing/index/index-processor_test.cc @@ -27,9 +27,9 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" #include "icing/absl_ports/str_cat.h" -#include "icing/absl_ports/str_join.h" #include "icing/document-builder.h" #include "icing/file/filesystem.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/index/hit/doc-hit-info.h" #include "icing/index/index.h" #include "icing/index/iterator/doc-hit-info-iterator.h" @@ -48,8 +48,6 @@ #include "icing/store/document-id.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" -#include "icing/testing/icu-data-file-helper.h" -#include "icing/testing/random-string.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" #include "icing/tokenization/language-segmenter-factory.h" @@ -90,8 +88,6 @@ constexpr std::string_view kRepeatedProperty = "repeated"; constexpr std::string_view kSubProperty = "submessage"; constexpr std::string_view kNestedType = "NestedType"; constexpr std::string_view kNestedProperty = "nested"; -constexpr std::string_view kExactVerbatimProperty = "verbatimExact"; -constexpr std::string_view kPrefixedVerbatimProperty = "verbatimPrefixed"; constexpr DocumentId kDocumentId0 = 0; constexpr DocumentId kDocumentId1 = 1; @@ -100,8 +96,6 @@ constexpr SectionId kExactSectionId = 0; constexpr SectionId kPrefixedSectionId 
= 1; constexpr SectionId kRepeatedSectionId = 2; constexpr SectionId kNestedSectionId = 3; -constexpr SectionId kExactVerbatimSectionId = 4; -constexpr SectionId kPrefixedVerbatimSectionId = 5; using Cardinality = PropertyConfigProto::Cardinality; using DataType = PropertyConfigProto::DataType; @@ -110,23 +104,21 @@ using ::testing::Eq; using ::testing::IsEmpty; using ::testing::Test; -constexpr PropertyConfigProto::DataType::Code TYPE_STRING = - PropertyConfigProto::DataType::STRING; -constexpr PropertyConfigProto::DataType::Code TYPE_BYTES = - PropertyConfigProto::DataType::BYTES; +constexpr PropertyConfigProto_DataType_Code TYPE_STRING = + PropertyConfigProto_DataType_Code_STRING; +constexpr PropertyConfigProto_DataType_Code TYPE_BYTES = + PropertyConfigProto_DataType_Code_BYTES; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = - PropertyConfigProto::Cardinality::OPTIONAL; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED = - PropertyConfigProto::Cardinality::REPEATED; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = + PropertyConfigProto_Cardinality_Code_OPTIONAL; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED = + PropertyConfigProto_Cardinality_Code_REPEATED; -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = - StringIndexingConfig::TokenizerType::PLAIN; -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_VERBATIM = - StringIndexingConfig::TokenizerType::VERBATIM; +constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = + StringIndexingConfig_TokenizerType_Code_PLAIN; -constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; -constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX; +constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; +constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX; class IndexProcessorTest : public Test { protected: @@ -153,12 +145,9 @@ class IndexProcessorTest : public Test { normalizer_factory::Create( /*max_term_byte_size=*/std::numeric_limits<int32_t>::max())); - std::string schema_store_dir = GetTestTempDir() + "/schema_store"; - ASSERT_TRUE( - filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str())); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_)); + SchemaStore::Create(&filesystem_, GetTestTempDir(), &fake_clock_)); SchemaProto schema = SchemaBuilder() .AddType( @@ -189,16 +178,6 @@ class IndexProcessorTest : public Test { .SetCardinality(CARDINALITY_REPEATED)) .AddProperty( PropertyConfigBuilder() - .SetName(kExactVerbatimProperty) - .SetDataTypeString(MATCH_EXACT, TOKENIZER_VERBATIM) - .SetCardinality(CARDINALITY_REPEATED)) - .AddProperty( - PropertyConfigBuilder() - .SetName(kPrefixedVerbatimProperty) - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_VERBATIM) - .SetCardinality(CARDINALITY_REPEATED)) - .AddProperty( - PropertyConfigBuilder() .SetName(kSubProperty) .SetDataTypeDocument( kNestedType, /*index_nested_properties=*/true) @@ -214,9 +193,15 @@ class IndexProcessorTest : public Test { .Build(); ICING_ASSERT_OK(schema_store_->SetSchema(schema)); + IndexProcessor::Options processor_options; + processor_options.max_tokens_per_document = 1000; + processor_options.token_limit_behavior = + IndexProcessor::Options::TokenLimitBehavior::kReturnError; + ICING_ASSERT_OK_AND_ASSIGN( index_processor_, - IndexProcessor::Create(normalizer_.get(), index_.get(), &fake_clock_)); + 
IndexProcessor::Create(normalizer_.get(), index_.get(), + processor_options, &fake_clock_)); mock_icing_filesystem_ = std::make_unique<IcingMockFilesystem>(); } @@ -247,12 +232,17 @@ std::vector<DocHitInfo> GetHits(std::unique_ptr<DocHitInfoIterator> iterator) { } TEST_F(IndexProcessorTest, CreationWithNullPointerShouldFail) { + IndexProcessor::Options processor_options; + processor_options.max_tokens_per_document = 1000; + processor_options.token_limit_behavior = + IndexProcessor::Options::TokenLimitBehavior::kReturnError; + EXPECT_THAT(IndexProcessor::Create(/*normalizer=*/nullptr, index_.get(), - &fake_clock_), + processor_options, &fake_clock_), StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); EXPECT_THAT(IndexProcessor::Create(normalizer_.get(), /*index=*/nullptr, - &fake_clock_), + processor_options, &fake_clock_), StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); } @@ -444,68 +434,103 @@ TEST_F(IndexProcessorTest, DocWithRepeatedProperty) { kDocumentId0, std::vector<SectionId>{kRepeatedSectionId}))); } -// TODO(b/196771754) This test is disabled on Android because it takes too long -// to generate all of the unique terms and the test times out. Try storing these -// unique terms in a file that the test can read from. -#ifndef __ANDROID__ +TEST_F(IndexProcessorTest, TooManyTokensReturnError) { + // Only allow the first four tokens ("hello", "world", "good", "night") to be + // indexed. + IndexProcessor::Options options; + options.max_tokens_per_document = 4; + options.token_limit_behavior = + IndexProcessor::Options::TokenLimitBehavior::kReturnError; -TEST_F(IndexProcessorTest, HitBufferExhaustedTest) { - // Testing has shown that adding ~600,000 hits will fill up the hit buffer. - std::vector<std::string> unique_terms_ = GenerateUniqueTerms(200000); - std::string content = absl_ports::StrJoin(unique_terms_, " "); + ICING_ASSERT_OK_AND_ASSIGN( + index_processor_, IndexProcessor::Create(normalizer_.get(), index_.get(), + options, &fake_clock_)); DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) - .AddStringProperty(std::string(kExactProperty), content) - .AddStringProperty(std::string(kPrefixedProperty), content) - .AddStringProperty(std::string(kRepeatedProperty), content) + .AddStringProperty(std::string(kExactProperty), "hello world") + .AddStringProperty(std::string(kPrefixedProperty), "good night moon!") .Build(); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), - StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED, - testing::HasSubstr("Hit buffer is full!"))); + StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); + + // "night" should have been indexed. + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr, + index_->GetIterator("night", kSectionIdMaskAll, + TermMatchType::EXACT_ONLY)); + EXPECT_THAT(GetHits(std::move(itr)), + ElementsAre(EqualsDocHitInfo( + kDocumentId0, std::vector<SectionId>{kPrefixedSectionId}))); + + // "moon" should not have been. 
+ ICING_ASSERT_OK_AND_ASSIGN(itr, + index_->GetIterator("moon", kSectionIdMaskAll, + TermMatchType::EXACT_ONLY)); + EXPECT_THAT(GetHits(std::move(itr)), IsEmpty()); } -TEST_F(IndexProcessorTest, LexiconExhaustedTest) { - // Testing has shown that adding ~300,000 terms generated this way will - // fill up the lexicon. - std::vector<std::string> unique_terms_ = GenerateUniqueTerms(300000); - std::string content = absl_ports::StrJoin(unique_terms_, " "); +TEST_F(IndexProcessorTest, TooManyTokensSuppressError) { + // Only allow the first four tokens ("hello", "world", "good", "night") to be + // indexed. + IndexProcessor::Options options; + options.max_tokens_per_document = 4; + options.token_limit_behavior = + IndexProcessor::Options::TokenLimitBehavior::kSuppressError; + + ICING_ASSERT_OK_AND_ASSIGN( + index_processor_, IndexProcessor::Create(normalizer_.get(), index_.get(), + options, &fake_clock_)); DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) - .AddStringProperty(std::string(kExactProperty), content) + .AddStringProperty(std::string(kExactProperty), "hello world") + .AddStringProperty(std::string(kPrefixedProperty), "good night moon!") .Build(); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), - StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED, - testing::HasSubstr("Unable to add term"))); + IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); -} -#endif // __ANDROID__ + // "night" should have been indexed. + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr, + index_->GetIterator("night", kSectionIdMaskAll, + TermMatchType::EXACT_ONLY)); + EXPECT_THAT(GetHits(std::move(itr)), + ElementsAre(EqualsDocHitInfo( + kDocumentId0, std::vector<SectionId>{kPrefixedSectionId}))); + + // "moon" should not have been. + ICING_ASSERT_OK_AND_ASSIGN(itr, + index_->GetIterator("moon", kSectionIdMaskAll, + TermMatchType::EXACT_ONLY)); + EXPECT_THAT(GetHits(std::move(itr)), IsEmpty()); +} TEST_F(IndexProcessorTest, TooLongTokens) { // Only allow the tokens of length four, truncating "hello", "world" and // "night". 
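  // (Note: max_term_byte_size truncates each individual term during
  // normalization; it is independent of the max_tokens_per_document count
  // limit exercised by the tests above.)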
+ IndexProcessor::Options options; + options.max_tokens_per_document = 1000; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Normalizer> normalizer, normalizer_factory::Create( /*max_term_byte_size=*/4)); ICING_ASSERT_OK_AND_ASSIGN( - index_processor_, - IndexProcessor::Create(normalizer.get(), index_.get(), &fake_clock_)); + index_processor_, IndexProcessor::Create(normalizer.get(), index_.get(), + options, &fake_clock_)); DocumentProto document = DocumentBuilder() @@ -667,6 +692,16 @@ TEST_F(IndexProcessorTest, NonAsciiIndexing) { lang_segmenter_, language_segmenter_factory::Create(std::move(segmenter_options))); + IndexProcessor::Options processor_options; + processor_options.max_tokens_per_document = 1000; + processor_options.token_limit_behavior = + IndexProcessor::Options::TokenLimitBehavior::kReturnError; + + ICING_ASSERT_OK_AND_ASSIGN( + index_processor_, + IndexProcessor::Create(normalizer_.get(), index_.get(), processor_options, + &fake_clock_)); + DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") @@ -692,13 +727,23 @@ TEST_F(IndexProcessorTest, NonAsciiIndexing) { TEST_F(IndexProcessorTest, LexiconFullIndexesSmallerTokensReturnsResourceExhausted) { + IndexProcessor::Options processor_options; + processor_options.max_tokens_per_document = 1000; + processor_options.token_limit_behavior = + IndexProcessor::Options::TokenLimitBehavior::kReturnError; + + ICING_ASSERT_OK_AND_ASSIGN( + index_processor_, + IndexProcessor::Create(normalizer_.get(), index_.get(), processor_options, + &fake_clock_)); + // This is the maximum token length that an empty lexicon constructed for a // lite index with merge size of 1MiB can support. constexpr int kMaxTokenLength = 16777217; // Create a string "ppppppp..." with a length that is too large to fit into // the lexicon. 
std::string enormous_string(kMaxTokenLength + 1, 'p'); - DocumentProto document_one = + DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) @@ -709,10 +754,24 @@ TEST_F(IndexProcessorTest, ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), - document_one)); + document)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocHitInfoIterator> itr, + index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); + EXPECT_THAT(GetHits(std::move(itr)), + ElementsAre(EqualsDocHitInfo( + kDocumentId0, std::vector<SectionId>{kExactSectionId}))); + + ICING_ASSERT_OK_AND_ASSIGN( + itr, + index_->GetIterator("baz", kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); + EXPECT_THAT(GetHits(std::move(itr)), + ElementsAre(EqualsDocHitInfo( + kDocumentId0, std::vector<SectionId>{kPrefixedSectionId}))); } TEST_F(IndexProcessorTest, IndexingDocAutomaticMerge) { @@ -736,9 +795,15 @@ TEST_F(IndexProcessorTest, IndexingDocAutomaticMerge) { ICING_ASSERT_OK_AND_ASSIGN( index_, Index::Create(options, &filesystem_, &icing_filesystem_)); + IndexProcessor::Options processor_options; + processor_options.max_tokens_per_document = 1000; + processor_options.token_limit_behavior = + IndexProcessor::Options::TokenLimitBehavior::kReturnError; + ICING_ASSERT_OK_AND_ASSIGN( index_processor_, - IndexProcessor::Create(normalizer_.get(), index_.get(), &fake_clock_)); + IndexProcessor::Create(normalizer_.get(), index_.get(), processor_options, + &fake_clock_)); DocumentId doc_id = 0; // Have determined experimentally that indexing 3373 documents with this text // will cause the LiteIndex to fill up. Further indexing will fail unless the @@ -792,9 +857,15 @@ TEST_F(IndexProcessorTest, IndexingDocMergeFailureResets) { index_, Index::Create(options, &filesystem_, mock_icing_filesystem_.get())); + IndexProcessor::Options processor_options; + processor_options.max_tokens_per_document = 1000; + processor_options.token_limit_behavior = + IndexProcessor::Options::TokenLimitBehavior::kReturnError; + ICING_ASSERT_OK_AND_ASSIGN( index_processor_, - IndexProcessor::Create(normalizer_.get(), index_.get(), &fake_clock_)); + IndexProcessor::Create(normalizer_.get(), index_.get(), processor_options, + &fake_clock_)); // 3. Index one document. This should fit in the LiteIndex without requiring a // merge. 
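Every updated test in this file now repeats the same Options boilerplate before calling IndexProcessor::Create, and the benchmark file factored the identical setup into its CreateIndexProcessor helper. A sketch of the analogous test helper follows; the name CreateTestIndexProcessor is hypothetical and simply mirrors the benchmark helper shown earlier, assuming the defaults these tests use (a 1000-token limit and kReturnError):

    // Hypothetical helper collapsing the Options boilerplate repeated in the
    // hunks above; not part of this CL.
    std::unique_ptr<IndexProcessor> CreateTestIndexProcessor(
        const Normalizer* normalizer, Index* index, const Clock* clock) {
      IndexProcessor::Options processor_options;
      processor_options.max_tokens_per_document = 1000;
      processor_options.token_limit_behavior =
          IndexProcessor::Options::TokenLimitBehavior::kReturnError;
      return IndexProcessor::Create(normalizer, index, processor_options, clock)
          .ValueOrDie();
    }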
@@ -816,95 +887,6 @@ TEST_F(IndexProcessorTest, IndexingDocMergeFailureResets) { EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id)); } -TEST_F(IndexProcessorTest, ExactVerbatimProperty) { - DocumentProto document = - DocumentBuilder() - .SetKey("icing", "fake_type/1") - .SetSchema(std::string(kFakeType)) - .AddStringProperty(std::string(kExactVerbatimProperty), - "Hello, world!") - .Build(); - ICING_ASSERT_OK_AND_ASSIGN( - TokenizedDocument tokenized_document, - TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), - document)); - EXPECT_THAT(tokenized_document.num_tokens(), 1); - - EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), - IsOk()); - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); - - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<DocHitInfoIterator> itr, - index_->GetIterator("Hello, world!", kSectionIdMaskAll, - TermMatchType::EXACT_ONLY)); - std::vector<DocHitInfo> hits = GetHits(std::move(itr)); - std::unordered_map<SectionId, Hit::TermFrequency> expectedMap{ - {kExactVerbatimSectionId, 1}}; - - EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( - kDocumentId0, expectedMap))); -} - -TEST_F(IndexProcessorTest, PrefixVerbatimProperty) { - DocumentProto document = - DocumentBuilder() - .SetKey("icing", "fake_type/1") - .SetSchema(std::string(kFakeType)) - .AddStringProperty(std::string(kPrefixedVerbatimProperty), - "Hello, world!") - .Build(); - ICING_ASSERT_OK_AND_ASSIGN( - TokenizedDocument tokenized_document, - TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), - document)); - EXPECT_THAT(tokenized_document.num_tokens(), 1); - - EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), - IsOk()); - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); - - // We expect to match the document we indexed as "Hello, w" is a prefix - // of "Hello, world!" - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr, - index_->GetIterator("Hello, w", kSectionIdMaskAll, - TermMatchType::PREFIX)); - std::vector<DocHitInfo> hits = GetHits(std::move(itr)); - std::unordered_map<SectionId, Hit::TermFrequency> expectedMap{ - {kPrefixedVerbatimSectionId, 1}}; - - EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( - kDocumentId0, expectedMap))); -} - -TEST_F(IndexProcessorTest, VerbatimPropertyDoesntMatchSubToken) { - DocumentProto document = - DocumentBuilder() - .SetKey("icing", "fake_type/1") - .SetSchema(std::string(kFakeType)) - .AddStringProperty(std::string(kPrefixedVerbatimProperty), - "Hello, world!") - .Build(); - ICING_ASSERT_OK_AND_ASSIGN( - TokenizedDocument tokenized_document, - TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), - document)); - EXPECT_THAT(tokenized_document.num_tokens(), 1); - - EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), - IsOk()); - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); - - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<DocHitInfoIterator> itr, - index_->GetIterator("world", kSectionIdMaskAll, TermMatchType::PREFIX)); - std::vector<DocHitInfo> hits = GetHits(std::move(itr)); - - // We should not have hits for term "world" as the index processor should - // create a sole token "Hello, world! for the document. 
- EXPECT_THAT(hits, IsEmpty()); -} - } // namespace } // namespace lib diff --git a/icing/index/index.cc b/icing/index/index.cc index 02ba699..db59ad2 100644 --- a/icing/index/index.cc +++ b/icing/index/index.cc @@ -36,7 +36,6 @@ #include "icing/legacy/index/icing-filesystem.h" #include "icing/proto/term.pb.h" #include "icing/schema/section.h" -#include "icing/scoring/ranker.h" #include "icing/store/document-id.h" #include "icing/util/logging.h" #include "icing/util/status-macros.h" @@ -71,25 +70,39 @@ IcingDynamicTrie::Options GetMainLexiconOptions() { return IcingDynamicTrie::Options(); } -enum class MergeAction { kTakeLiteTerm, kTakeMainTerm, kMergeTerms }; +// Helper function to check if a term is in the given namespaces. +// TODO(tjbarron): Implement a method PropertyReadersAll.HasAnyProperty(). +bool IsTermInNamespaces( + const IcingDynamicTrie::PropertyReadersAll& property_reader, + uint32_t value_index, const std::vector<NamespaceId>& namespace_ids) { + if (namespace_ids.empty()) { + return true; + } + for (NamespaceId namespace_id : namespace_ids) { + if (property_reader.HasProperty(GetNamespacePropertyId(namespace_id), + value_index)) { + return true; + } + } + + return false; +} -// Merge the TermMetadata from lite index and main index. If the term exists in -// both index, sum up its hit count and push it to the term heap. -// The heap is a min-heap. So that we can avoid some push operation but the time -// complexity is O(NlgK) which N is total number of term and K is num_to_return. -std::vector<TermMetadata> MergeAndRankTermMetadatas( +enum class MergeAction { kTakeLiteTerm, kTakeMainTerm, kMergeTerms }; +std::vector<TermMetadata> MergeTermMetadatas( std::vector<TermMetadata> lite_term_metadata_list, std::vector<TermMetadata> main_term_metadata_list, int num_to_return) { - std::vector<TermMetadata> merged_term_metadata_heap; - merged_term_metadata_heap.reserve( + std::vector<TermMetadata> merged_term_metadata_list; + merged_term_metadata_list.reserve( std::min(lite_term_metadata_list.size() + main_term_metadata_list.size(), static_cast<size_t>(num_to_return))); auto lite_term_itr = lite_term_metadata_list.begin(); auto main_term_itr = main_term_metadata_list.begin(); MergeAction merge_action; - while (lite_term_itr != lite_term_metadata_list.end() || - main_term_itr != main_term_metadata_list.end()) { + while (merged_term_metadata_list.size() < num_to_return && + (lite_term_itr != lite_term_metadata_list.end() || + main_term_itr != main_term_metadata_list.end())) { // Get pointers to the next metadatas in each group, if available // Determine how to merge. 
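    // kTakeLiteTerm and kTakeMainTerm copy one entry through unchanged;
    // kMergeTerms fires when both lists contain the same term and sums the
    // two hit counts into a single merged entry.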
if (main_term_itr == main_term_metadata_list.end()) { @@ -106,32 +119,23 @@ std::vector<TermMetadata> MergeAndRankTermMetadatas( } switch (merge_action) { case MergeAction::kTakeLiteTerm: - PushToTermHeap(std::move(*lite_term_itr), num_to_return, - merged_term_metadata_heap); + merged_term_metadata_list.push_back(std::move(*lite_term_itr)); ++lite_term_itr; break; case MergeAction::kTakeMainTerm: - PushToTermHeap(std::move(*main_term_itr), num_to_return, - merged_term_metadata_heap); + merged_term_metadata_list.push_back(std::move(*main_term_itr)); ++main_term_itr; break; case MergeAction::kMergeTerms: int total_est_hit_count = lite_term_itr->hit_count + main_term_itr->hit_count; - PushToTermHeap(TermMetadata(std::move(lite_term_itr->content), - total_est_hit_count), - num_to_return, merged_term_metadata_heap); + merged_term_metadata_list.emplace_back( + std::move(lite_term_itr->content), total_est_hit_count); ++lite_term_itr; ++main_term_itr; break; } } - // Reverse the list since we pop them from a min heap and we need to return in - // decreasing order. - std::vector<TermMetadata> merged_term_metadata_list = - PopAllTermsFromHeap(merged_term_metadata_heap); - std::reverse(merged_term_metadata_list.begin(), - merged_term_metadata_list.end()); return merged_term_metadata_list; } @@ -210,56 +214,77 @@ Index::GetIterator(const std::string& term, SectionIdMask section_id_mask, libtextclassifier3::StatusOr<std::vector<TermMetadata>> Index::FindLiteTermsByPrefix(const std::string& prefix, - const NamespaceChecker* namespace_checker) { + const std::vector<NamespaceId>& namespace_ids, + int num_to_return) { // Finds all the terms that start with the given prefix in the lexicon. IcingDynamicTrie::Iterator term_iterator(lite_index_->lexicon(), prefix.c_str()); + // A property reader to help check if a term has some property. + IcingDynamicTrie::PropertyReadersAll property_reader(lite_index_->lexicon()); + std::vector<TermMetadata> term_metadata_list; - while (term_iterator.IsValid()) { + while (term_iterator.IsValid() && term_metadata_list.size() < num_to_return) { uint32_t term_value_index = term_iterator.GetValueIndex(); + // Skips the terms that don't exist in the given namespaces. We won't skip + // any terms if namespace_ids is empty. + if (!IsTermInNamespaces(property_reader, term_value_index, namespace_ids)) { + term_iterator.Advance(); + continue; + } + ICING_ASSIGN_OR_RETURN( uint32_t term_id, term_id_codec_->EncodeTvi(term_value_index, TviType::LITE), absl_ports::InternalError("Failed to access terms in lexicon.")); - ICING_ASSIGN_OR_RETURN(int hit_count, - lite_index_->CountHits(term_id, namespace_checker)); - if (hit_count > 0) { - // There is at least one document in the given namespace has this term. - term_metadata_list.push_back( - TermMetadata(term_iterator.GetKey(), hit_count)); - } + + term_metadata_list.emplace_back(term_iterator.GetKey(), + lite_index_->CountHits(term_id)); term_iterator.Advance(); } + if (term_iterator.IsValid()) { + // We exited the loop above because we hit the num_to_return limit. + ICING_LOG(WARNING) << "Ran into limit of " << num_to_return + << " retrieving suggestions for " << prefix + << ". 
Some suggestions may not be returned and others " + "may be misranked."; + } return term_metadata_list; } libtextclassifier3::StatusOr<std::vector<TermMetadata>> -Index::FindTermsByPrefix(const std::string& prefix, int num_to_return, - TermMatchType::Code term_match_type, - const NamespaceChecker* namespace_checker) { +Index::FindTermsByPrefix(const std::string& prefix, + const std::vector<NamespaceId>& namespace_ids, + int num_to_return) { std::vector<TermMetadata> term_metadata_list; if (num_to_return <= 0) { return term_metadata_list; } + // Get results from the LiteIndex. - ICING_ASSIGN_OR_RETURN(std::vector<TermMetadata> lite_term_metadata_list, - FindLiteTermsByPrefix(prefix, namespace_checker)); + ICING_ASSIGN_OR_RETURN( + std::vector<TermMetadata> lite_term_metadata_list, + FindLiteTermsByPrefix(prefix, namespace_ids, num_to_return)); + // Append results from the MainIndex. - ICING_ASSIGN_OR_RETURN(std::vector<TermMetadata> main_term_metadata_list, - main_index_->FindTermsByPrefix(prefix, term_match_type, - namespace_checker)); - return MergeAndRankTermMetadatas(std::move(lite_term_metadata_list), - std::move(main_term_metadata_list), - num_to_return); + ICING_ASSIGN_OR_RETURN( + std::vector<TermMetadata> main_term_metadata_list, + main_index_->FindTermsByPrefix(prefix, namespace_ids, num_to_return)); + + return MergeTermMetadatas(std::move(lite_term_metadata_list), + std::move(main_term_metadata_list), num_to_return); } IndexStorageInfoProto Index::GetStorageInfo() const { IndexStorageInfoProto storage_info; int64_t directory_size = filesystem_->GetDiskUsage(options_.base_dir.c_str()); - storage_info.set_index_size(Filesystem::SanitizeFileSize(directory_size)); + if (directory_size != Filesystem::kBadFileSize) { + storage_info.set_index_size(directory_size); + } else { + storage_info.set_index_size(-1); + } storage_info = lite_index_->GetStorageInfo(std::move(storage_info)); return main_index_->GetStorageInfo(std::move(storage_info)); } diff --git a/icing/index/index.h b/icing/index/index.h index 5c53349..eab5be8 100644 --- a/icing/index/index.h +++ b/icing/index/index.h @@ -32,12 +32,10 @@ #include "icing/index/term-id-codec.h" #include "icing/index/term-metadata.h" #include "icing/legacy/index/icing-filesystem.h" -#include "icing/proto/debug.pb.h" #include "icing/proto/storage.pb.h" #include "icing/proto/term.pb.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" -#include "icing/store/namespace-checker.h" #include "icing/store/namespace-id.h" #include "icing/util/crc32.h" @@ -144,14 +142,9 @@ class Index { // index. // verbosity > 0, more detailed debug information including raw postings // lists. - IndexDebugInfoProto GetDebugInfo(int verbosity) const { - IndexDebugInfoProto debug_info; - *debug_info.mutable_index_storage_info() = GetStorageInfo(); - *debug_info.mutable_lite_index_info() = - lite_index_->GetDebugInfo(verbosity); - *debug_info.mutable_main_index_info() = - main_index_->GetDebugInfo(verbosity); - return debug_info; + void GetDebugInfo(int verbosity, std::string* out) const { + lite_index_->GetDebugInfo(verbosity, out); + main_index_->GetDebugInfo(verbosity, out); } // Returns the byte size of the all the elements held in the index. This @@ -188,17 +181,17 @@ class Index { TermMatchType::Code term_match_type); // Finds terms with the given prefix in the given namespaces. If - // 'namespace_ids' is empty, returns results from all the namespaces. Results - // are sorted in decreasing order of hit count. 
Number of results are no more - than 'num_to_return'. + // 'namespace_ids' is empty, returns results from all the namespaces. The + input prefix must be normalized, otherwise inaccurate results may be + returned. Results are not sorted specifically and are in their original + order. The number of results is no more than 'num_to_return'. // // Returns: // A list of TermMetadata on success // INTERNAL_ERROR if failed to access term data. libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindTermsByPrefix( - const std::string& prefix, int num_to_return, - TermMatchType::Code term_match_type, - const NamespaceChecker* namespace_checker); + const std::string& prefix, const std::vector<NamespaceId>& namespace_ids, + int num_to_return); // A class that can be used to add hits to the index. // @@ -274,7 +267,8 @@ class Index { filesystem_(filesystem) {} libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindLiteTermsByPrefix( - const std::string& prefix, const NamespaceChecker* namespace_checker); + const std::string& prefix, const std::vector<NamespaceId>& namespace_ids, + int num_to_return); std::unique_ptr<LiteIndex> lite_index_; std::unique_ptr<MainIndex> main_index_; diff --git a/icing/index/index_test.cc b/icing/index/index_test.cc index 8355c01..16593ef 100644 --- a/icing/index/index_test.cc +++ b/icing/index/index_test.cc @@ -31,12 +31,10 @@ #include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/legacy/index/icing-filesystem.h" #include "icing/legacy/index/icing-mock-filesystem.h" -#include "icing/proto/debug.pb.h" #include "icing/proto/storage.pb.h" #include "icing/proto/term.pb.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" -#include "icing/testing/always-true-namespace-checker-impl.h" #include "icing/testing/common-matchers.h" #include "icing/testing/random-string.h" #include "icing/testing/tmp-directory.h" @@ -90,10 +88,18 @@ constexpr DocumentId kDocumentId4 = 4; constexpr DocumentId kDocumentId5 = 5; constexpr DocumentId kDocumentId6 = 6; constexpr DocumentId kDocumentId7 = 7; -constexpr DocumentId kDocumentId8 = 8; constexpr SectionId kSectionId2 = 2; constexpr SectionId kSectionId3 = 3; +// The value returned by IndexBlock::ApproximateFullPostingListHitsForBlock( +// GetBlockSize(), +// GetPostingListIndexBits(posting_list_utils::min_posting_list_size())); +constexpr int kMinSizePlApproxHits = 3; +// The value returned by IndexBlock::ApproximateFullPostingListHitsForBlock( +// GetBlockSize(), +// GetPostingListIndexBits(2 * posting_list_utils::min_posting_list_size())); +constexpr int kSecondSmallestPlApproxHits = 7; + std::vector<DocHitInfo> GetHits(std::unique_ptr<DocHitInfoIterator> iterator) { std::vector<DocHitInfo> infos; while (iterator->Advance().ok()) { @@ -909,306 +915,217 @@ TEST_F(IndexTest, InvalidHitBufferSize) { TEST_F(IndexTest, FindTermByPrefixShouldReturnEmpty) { Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); - AlwaysTrueNamespaceCheckerImpl impl; EXPECT_THAT(edit.BufferTerm("fool"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*num_to_return=*/0, - TermMatchType::PREFIX, &impl), + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*namespace_ids=*/{0}, + /*num_to_return=*/0), IsOkAndHolds(IsEmpty())); - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", - /*num_to_return=*/-1, - TermMatchType::PREFIX, &impl), +
EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*namespace_ids=*/{0}, + /*num_to_return=*/-1), IsOkAndHolds(IsEmpty())); ICING_ASSERT_OK(index_->Merge()); - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", - /*num_to_return=*/0, - TermMatchType::PREFIX, &impl), + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*namespace_ids=*/{0}, + /*num_to_return=*/0), IsOkAndHolds(IsEmpty())); - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", - /*num_to_return=*/-1, - TermMatchType::PREFIX, &impl), + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*namespace_ids=*/{0}, + /*num_to_return=*/-1), IsOkAndHolds(IsEmpty())); } TEST_F(IndexTest, FindTermByPrefixShouldReturnCorrectResult) { Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - AlwaysTrueNamespaceCheckerImpl impl; EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); EXPECT_THAT(edit.BufferTerm("bar"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); // "b" should only match "bar" but not "foo". - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"b", - /*num_to_return=*/10, - TermMatchType::PREFIX, &impl), + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"b", /*namespace_ids=*/{0}, + /*num_to_return=*/10), IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("bar", 1)))); ICING_ASSERT_OK(index_->Merge()); // "b" should only match "bar" but not "foo". - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"b", - /*num_to_return=*/10, - TermMatchType::PREFIX, &impl), - IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("bar", 1)))); + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"b", /*namespace_ids=*/{0}, + /*num_to_return=*/10), + IsOkAndHolds(UnorderedElementsAre( + EqualsTermMetadata("bar", kMinSizePlApproxHits)))); } TEST_F(IndexTest, FindTermByPrefixShouldRespectNumToReturn) { Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - AlwaysTrueNamespaceCheckerImpl impl; EXPECT_THAT(edit.BufferTerm("fo"), IsOk()); EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); EXPECT_THAT(edit.BufferTerm("fool"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); // We have 3 results but only 2 should be returned. - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", - /*num_to_return=*/2, - TermMatchType::PREFIX, &impl), + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, + /*num_to_return=*/2), IsOkAndHolds(SizeIs(2))); ICING_ASSERT_OK(index_->Merge()); // We have 3 results but only 2 should be returned. 
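// [Usage sketch] The new FindTermsByPrefix signature exercised by these
// tests, assuming an initialized Index named index: an empty namespace_ids
// vector means "search every namespace", and num_to_return <= 0 simply
// yields an empty list.
ICING_ASSIGN_OR_RETURN(
    std::vector<TermMetadata> suggestions,
    index->FindTermsByPrefix(/*prefix=*/"fo", /*namespace_ids=*/{},
                             /*num_to_return=*/10));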
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", - /*num_to_return=*/2, - TermMatchType::PREFIX, &impl), + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, + /*num_to_return=*/2), IsOkAndHolds(SizeIs(2))); } -TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInAllNamespaces) { +TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInOneNamespace) { Index::Editor edit1 = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - AlwaysTrueNamespaceCheckerImpl impl; EXPECT_THAT(edit1.BufferTerm("fo"), IsOk()); + EXPECT_THAT(edit1.BufferTerm("foo"), IsOk()); EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk()); Index::Editor edit2 = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/1); - EXPECT_THAT(edit2.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit2.BufferTerm("fool"), IsOk()); EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk()); - Index::Editor edit3 = - index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY, - /*namespace_id=*/2); - EXPECT_THAT(edit3.BufferTerm("fool"), IsOk()); - EXPECT_THAT(edit3.IndexAllBufferedTerms(), IsOk()); - - // Should return "fo", "foo" and "fool" across all namespaces. - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", - /*num_to_return=*/10, - TermMatchType::PREFIX, &impl), - IsOkAndHolds(UnorderedElementsAre( - EqualsTermMetadata("fo", 1), EqualsTermMetadata("foo", 1), - EqualsTermMetadata("fool", 1)))); + // namespace with id 0 has 2 results. + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, + /*num_to_return=*/10), + IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fo", 1), + EqualsTermMetadata("foo", 1)))); + // namespace with id 1 has 1 result. + EXPECT_THAT( + index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1}, + /*num_to_return=*/10), + IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fool", 1)))); ICING_ASSERT_OK(index_->Merge()); - // Should return "fo", "foo" and "fool" across all namespaces. - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", - /*num_to_return=*/10, - TermMatchType::PREFIX, &impl), + // namespace with id 0 has 2 results. + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, + /*num_to_return=*/10), IsOkAndHolds(UnorderedElementsAre( - EqualsTermMetadata("fo", 1), EqualsTermMetadata("foo", 1), - EqualsTermMetadata("fool", 1)))); + EqualsTermMetadata("fo", kMinSizePlApproxHits), + EqualsTermMetadata("foo", kMinSizePlApproxHits)))); + // namespace with id 1 has 1 result. 
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1}, + /*num_to_return=*/10), + IsOkAndHolds(UnorderedElementsAre( + EqualsTermMetadata("fool", kMinSizePlApproxHits)))); } -TEST_F(IndexTest, FindTermByPrefixShouldReturnCorrectHitCount) { +TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInMultipleNamespaces) { Index::Editor edit1 = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - AlwaysTrueNamespaceCheckerImpl impl; - EXPECT_THAT(edit1.BufferTerm("foo"), IsOk()); - EXPECT_THAT(edit1.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit1.BufferTerm("fo"), IsOk()); EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk()); Index::Editor edit2 = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY, - /*namespace_id=*/0); - EXPECT_THAT(edit2.BufferTerm("fool"), IsOk()); + /*namespace_id=*/1); + EXPECT_THAT(edit2.BufferTerm("foo"), IsOk()); EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk()); - // 'foo' has 1 hit, 'fool' has 2 hits. - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", - /*num_to_return=*/10, - TermMatchType::PREFIX, &impl), - IsOkAndHolds(ElementsAre(EqualsTermMetadata("fool", 2), - EqualsTermMetadata("foo", 1)))); + Index::Editor edit3 = + index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY, + /*namespace_id=*/2); + EXPECT_THAT(edit3.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit3.IndexAllBufferedTerms(), IsOk()); + + // Should return "foo" and "fool" which are in namespaces with ids 1 and 2. + EXPECT_THAT( + index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1, 2}, + /*num_to_return=*/10), + IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1), + EqualsTermMetadata("fool", 1)))); ICING_ASSERT_OK(index_->Merge()); - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", - /*num_to_return=*/10, - TermMatchType::PREFIX, &impl), - IsOkAndHolds(ElementsAre(EqualsTermMetadata("fool", 2), - EqualsTermMetadata("foo", 1)))); + EXPECT_THAT( + index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1, 2}, + /*num_to_return=*/10), + IsOkAndHolds(UnorderedElementsAre( + EqualsTermMetadata("foo", kMinSizePlApproxHits), + EqualsTermMetadata("fool", kMinSizePlApproxHits)))); } -TEST_F(IndexTest, FindTermByPrefixShouldReturnInOrder) { - // Push 6 term-six, 5 term-five, 4 term-four, 3 term-three, 2 term-two and one - // term-one into lite index. 
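// [Sketch, not icing code] The FindTermByPrefixShouldReturnInOrder test being
// removed here covered ranked output, which the deleted
// MergeAndRankTermMetadatas produced with a min-heap of size K (the O(N lg K)
// approach its comment described). The same idea with the standard library,
// names illustrative:
#include <algorithm>
#include <cstddef>
#include <functional>
#include <queue>
#include <string>
#include <utility>
#include <vector>

std::vector<std::pair<int, std::string>> TopKByHitCount(
    const std::vector<std::pair<int, std::string>>& candidates, size_t k) {
  // With std::greater the queue is a min-heap: top() is the smallest kept
  // pair, so the heap always holds the k largest hit counts seen so far.
  std::priority_queue<std::pair<int, std::string>,
                      std::vector<std::pair<int, std::string>>,
                      std::greater<std::pair<int, std::string>>>
      min_heap;
  for (const auto& candidate : candidates) {
    min_heap.push(candidate);
    if (min_heap.size() > k) {
      min_heap.pop();  // Evict the current smallest to stay at size k.
    }
  }
  std::vector<std::pair<int, std::string>> top_k;
  while (!min_heap.empty()) {
    top_k.push_back(min_heap.top());
    min_heap.pop();
  }
  // A min-heap pops in increasing order; reverse for decreasing hit count.
  std::reverse(top_k.begin(), top_k.end());
  return top_k;
}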
+TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInAllNamespaces) { Index::Editor edit1 = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - AlwaysTrueNamespaceCheckerImpl impl; - EXPECT_THAT(edit1.BufferTerm("term-one"), IsOk()); - EXPECT_THAT(edit1.BufferTerm("term-two"), IsOk()); - EXPECT_THAT(edit1.BufferTerm("term-three"), IsOk()); - EXPECT_THAT(edit1.BufferTerm("term-four"), IsOk()); - EXPECT_THAT(edit1.BufferTerm("term-five"), IsOk()); - EXPECT_THAT(edit1.BufferTerm("term-six"), IsOk()); + EXPECT_THAT(edit1.BufferTerm("fo"), IsOk()); EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk()); Index::Editor edit2 = - index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY, - /*namespace_id=*/0); - EXPECT_THAT(edit2.BufferTerm("term-two"), IsOk()); - EXPECT_THAT(edit2.BufferTerm("term-three"), IsOk()); - EXPECT_THAT(edit2.BufferTerm("term-four"), IsOk()); - EXPECT_THAT(edit2.BufferTerm("term-five"), IsOk()); - EXPECT_THAT(edit2.BufferTerm("term-six"), IsOk()); + index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY, + /*namespace_id=*/1); + EXPECT_THAT(edit2.BufferTerm("foo"), IsOk()); EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk()); Index::Editor edit3 = - index_->Edit(kDocumentId3, kSectionId2, TermMatchType::EXACT_ONLY, - /*namespace_id=*/0); - EXPECT_THAT(edit3.BufferTerm("term-three"), IsOk()); - EXPECT_THAT(edit3.BufferTerm("term-four"), IsOk()); - EXPECT_THAT(edit3.BufferTerm("term-five"), IsOk()); - EXPECT_THAT(edit3.BufferTerm("term-six"), IsOk()); + index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY, + /*namespace_id=*/2); + EXPECT_THAT(edit3.BufferTerm("fool"), IsOk()); EXPECT_THAT(edit3.IndexAllBufferedTerms(), IsOk()); - Index::Editor edit4 = - index_->Edit(kDocumentId4, kSectionId2, TermMatchType::EXACT_ONLY, - /*namespace_id=*/0); - EXPECT_THAT(edit4.BufferTerm("term-four"), IsOk()); - EXPECT_THAT(edit4.BufferTerm("term-five"), IsOk()); - EXPECT_THAT(edit4.BufferTerm("term-six"), IsOk()); - EXPECT_THAT(edit4.IndexAllBufferedTerms(), IsOk()); - - Index::Editor edit5 = - index_->Edit(kDocumentId5, kSectionId2, TermMatchType::EXACT_ONLY, - /*namespace_id=*/0); - EXPECT_THAT(edit5.BufferTerm("term-five"), IsOk()); - EXPECT_THAT(edit5.BufferTerm("term-six"), IsOk()); - EXPECT_THAT(edit5.IndexAllBufferedTerms(), IsOk()); - - Index::Editor edit6 = - index_->Edit(kDocumentId6, kSectionId2, TermMatchType::EXACT_ONLY, - /*namespace_id=*/0); - EXPECT_THAT(edit6.BufferTerm("term-six"), IsOk()); - EXPECT_THAT(edit6.IndexAllBufferedTerms(), IsOk()); - - // verify the order in lite index is correct. - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"t", - /*num_to_return=*/10, - TermMatchType::PREFIX, &impl), - IsOkAndHolds(ElementsAre(EqualsTermMetadata("term-six", 6), - EqualsTermMetadata("term-five", 5), - EqualsTermMetadata("term-four", 4), - EqualsTermMetadata("term-three", 3), - EqualsTermMetadata("term-two", 2), - EqualsTermMetadata("term-one", 1)))); + // Should return "fo", "foo" and "fool" across all namespaces. 
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{}, + /*num_to_return=*/10), + IsOkAndHolds(UnorderedElementsAre( + EqualsTermMetadata("fo", 1), EqualsTermMetadata("foo", 1), + EqualsTermMetadata("fool", 1)))); ICING_ASSERT_OK(index_->Merge()); - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"t", - /*num_to_return=*/10, - TermMatchType::PREFIX, &impl), - IsOkAndHolds(ElementsAre(EqualsTermMetadata("term-six", 6), - EqualsTermMetadata("term-five", 5), - EqualsTermMetadata("term-four", 4), - EqualsTermMetadata("term-three", 3), - EqualsTermMetadata("term-two", 2), - EqualsTermMetadata("term-one", 1)))); - - // keep push terms to the lite index. We will add 2 document to term-five, - // term-three and term-one. The output order should be 5-6-3-4-1-2. - Index::Editor edit7 = - index_->Edit(kDocumentId7, kSectionId2, TermMatchType::EXACT_ONLY, - /*namespace_id=*/0); - EXPECT_THAT(edit7.BufferTerm("term-one"), IsOk()); - EXPECT_THAT(edit7.BufferTerm("term-three"), IsOk()); - EXPECT_THAT(edit7.BufferTerm("term-five"), IsOk()); - EXPECT_THAT(edit7.IndexAllBufferedTerms(), IsOk()); - - Index::Editor edit8 = - index_->Edit(kDocumentId8, kSectionId2, TermMatchType::EXACT_ONLY, - /*namespace_id=*/0); - EXPECT_THAT(edit8.BufferTerm("term-one"), IsOk()); - EXPECT_THAT(edit8.BufferTerm("term-three"), IsOk()); - EXPECT_THAT(edit8.BufferTerm("term-five"), IsOk()); - EXPECT_THAT(edit8.IndexAllBufferedTerms(), IsOk()); - - // verify the combination of lite index and main index is in correct order. - EXPECT_THAT( - index_->FindTermsByPrefix(/*prefix=*/"t", /*num_to_return=*/10, - TermMatchType::PREFIX, &impl), - IsOkAndHolds(ElementsAre( - EqualsTermMetadata("term-five", 7), EqualsTermMetadata("term-six", 6), - EqualsTermMetadata("term-three", 5), - EqualsTermMetadata("term-four", 4), EqualsTermMetadata("term-one", 3), - EqualsTermMetadata("term-two", 2)))); - - // Get the first three terms. - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"t", - /*num_to_return=*/3, - TermMatchType::PREFIX, &impl), - IsOkAndHolds(ElementsAre(EqualsTermMetadata("term-five", 7), - EqualsTermMetadata("term-six", 6), - EqualsTermMetadata("term-three", 5)))); + // Should return "fo", "foo" and "fool" across all namespaces. 
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{}, + /*num_to_return=*/10), + IsOkAndHolds(UnorderedElementsAre( + EqualsTermMetadata("fo", kMinSizePlApproxHits), + EqualsTermMetadata("foo", kMinSizePlApproxHits), + EqualsTermMetadata("fool", kMinSizePlApproxHits)))); } -TEST_F(IndexTest, FindTermByPrefix_InTermMatchTypePrefix_ShouldReturnInOrder) { +TEST_F(IndexTest, FindTermByPrefixShouldReturnCorrectHitCount) { Index::Editor edit1 = - index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX, + index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - AlwaysTrueNamespaceCheckerImpl impl; - EXPECT_THAT(edit1.BufferTerm("fo"), IsOk()); + EXPECT_THAT(edit1.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit1.BufferTerm("fool"), IsOk()); EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk()); Index::Editor edit2 = - index_->Edit(kDocumentId2, kSectionId2, TermMatchType::PREFIX, + index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - EXPECT_THAT(edit2.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit2.BufferTerm("fool"), IsOk()); EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk()); - Index::Editor edit3 = - index_->Edit(kDocumentId3, kSectionId2, TermMatchType::PREFIX, - /*namespace_id=*/0); - EXPECT_THAT(edit3.BufferTerm("fool"), IsOk()); - EXPECT_THAT(edit3.IndexAllBufferedTerms(), IsOk()); + // 'foo' has 1 hit, 'fool' has 2 hits. + EXPECT_THAT( + index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, + /*num_to_return=*/10), + IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1), + EqualsTermMetadata("fool", 2)))); ICING_ASSERT_OK(index_->Merge()); - // verify the order in pls is correct - // "fo" { {doc0, exact_hit}, {doc1, prefix_hit}, {doc2, prefix_hit} } - // "foo" { {doc1, exact_hit}, {doc2, prefix_hit} } - // "fool" { {doc2, exact_hit} } - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", - /*num_to_return=*/10, - TermMatchType::PREFIX, &impl), - IsOkAndHolds(ElementsAre(EqualsTermMetadata("fo", 3), - EqualsTermMetadata("foo", 2), - EqualsTermMetadata("fool", 1)))); - // Find by exact only, all terms should be equally. - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*num_to_return=*/10, - TermMatchType::EXACT_ONLY, &impl), + + // foo's one hit should fit on a min-sized pl, fool's two hits should also fit + // on a min-sized pl. + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, + /*num_to_return=*/10), IsOkAndHolds(UnorderedElementsAre( - EqualsTermMetadata("fo", 1), EqualsTermMetadata("foo", 1), - EqualsTermMetadata("fool", 1)))); + EqualsTermMetadata("foo", kMinSizePlApproxHits), + EqualsTermMetadata("fool", kMinSizePlApproxHits)))); } -TEST_F(IndexTest, FindTermByPrefixShouldReturnHitCountForMain) { +TEST_F(IndexTest, FindTermByPrefixShouldReturnApproximateHitCountForMain) { Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - AlwaysTrueNamespaceCheckerImpl impl; EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); EXPECT_THAT(edit.BufferTerm("fool"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); @@ -1243,26 +1160,26 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnHitCountForMain) { EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); // 'foo' has 1 hit, 'fool' has 8 hits. 
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", - /*num_to_return=*/10, - TermMatchType::PREFIX, &impl), - IsOkAndHolds(ElementsAre(EqualsTermMetadata("fool", 8), - EqualsTermMetadata("foo", 1)))); - - ICING_ASSERT_OK(index_->Merge()); - EXPECT_THAT( - index_->FindTermsByPrefix(/*prefix=*/"f", /*num_to_return=*/10, - TermMatchType::PREFIX, &impl), + index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, + /*num_to_return=*/10), IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1), EqualsTermMetadata("fool", 8)))); + + ICING_ASSERT_OK(index_->Merge()); + + // foo's hits should fit on a single pl. fool's hits will need two pls. + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, + /*num_to_return=*/10), + IsOkAndHolds(UnorderedElementsAre( + EqualsTermMetadata("foo", kMinSizePlApproxHits), + EqualsTermMetadata("fool", kSecondSmallestPlApproxHits)))); } TEST_F(IndexTest, FindTermByPrefixShouldReturnCombinedHitCount) { Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - AlwaysTrueNamespaceCheckerImpl impl; EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); EXPECT_THAT(edit.BufferTerm("fool"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); @@ -1274,18 +1191,19 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnCombinedHitCount) { EXPECT_THAT(edit.BufferTerm("fool"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*num_to_return=*/10, - TermMatchType::PREFIX, &impl), - IsOkAndHolds(ElementsAre(EqualsTermMetadata("fool", 2), - EqualsTermMetadata("foo", 1)))); + // 'foo' has 1 hit in the main index, 'fool' has 1 hit in the main index and + // 1 hit in the lite index. + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, + /*num_to_return=*/10), + IsOkAndHolds(UnorderedElementsAre( + EqualsTermMetadata("foo", kMinSizePlApproxHits), + EqualsTermMetadata("fool", kMinSizePlApproxHits + 1)))); } TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsFromBothIndices) { Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - AlwaysTrueNamespaceCheckerImpl impl; - EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); @@ -1297,11 +1215,11 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsFromBothIndices) { EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); // 'foo' has 1 hit in the main index, 'fool' has 1 hit in the lite index. 
- EXPECT_THAT( - index_->FindTermsByPrefix(/*prefix=*/"f", /*num_to_return=*/10, - TermMatchType::PREFIX, &impl), - IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1), - EqualsTermMetadata("fool", 1)))); + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, + /*num_to_return=*/10), + IsOkAndHolds(UnorderedElementsAre( + EqualsTermMetadata("foo", kMinSizePlApproxHits), + EqualsTermMetadata("fool", 1)))); } TEST_F(IndexTest, GetElementsSize) { @@ -1395,14 +1313,12 @@ TEST_F(IndexTest, GetDebugInfo) { EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::PREFIX, /*namespace_id=*/0); - index_->set_last_added_document_id(kDocumentId1); ASSERT_THAT(edit.BufferTerm("foot"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK(index_->Merge()); edit = index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - index_->set_last_added_document_id(kDocumentId2); ASSERT_THAT(edit.BufferTerm("footer"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId2, kSectionId3, TermMatchType::PREFIX, @@ -1410,45 +1326,40 @@ TEST_F(IndexTest, GetDebugInfo) { ASSERT_THAT(edit.BufferTerm("foo"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); - IndexDebugInfoProto out0 = index_->GetDebugInfo(/*verbosity=*/0); - EXPECT_FALSE(out0.main_index_info().has_flash_index_storage_info()); - EXPECT_THAT(out0.main_index_info().last_added_document_id(), - Eq(kDocumentId1)); - EXPECT_THAT(out0.lite_index_info().curr_size(), Eq(2)); - EXPECT_THAT(out0.lite_index_info().last_added_document_id(), - Eq(kDocumentId2)); + std::string out0; + index_->GetDebugInfo(/*verbosity=*/0, &out0); + EXPECT_THAT(out0, Not(IsEmpty())); - IndexDebugInfoProto out1 = index_->GetDebugInfo(/*verbosity=*/1); - EXPECT_THAT(out1.main_index_info().flash_index_storage_info(), - Not(IsEmpty())); + std::string out1; + index_->GetDebugInfo(/*verbosity=*/1, &out1); + EXPECT_THAT(out1, SizeIs(Gt(out0.size()))); // Add one more doc to the lite index. Debug strings should change. edit = index_->Edit(kDocumentId3, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); - index_->set_last_added_document_id(kDocumentId3); ASSERT_THAT(edit.BufferTerm("far"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); - IndexDebugInfoProto out2 = index_->GetDebugInfo(/*verbosity=*/0); - EXPECT_THAT(out2.lite_index_info().curr_size(), Eq(3)); - EXPECT_THAT(out2.lite_index_info().last_added_document_id(), - Eq(kDocumentId3)); + std::string out2; + index_->GetDebugInfo(/*verbosity=*/0, &out2); + EXPECT_THAT(out2, Ne(out0)); + + std::string out3; + index_->GetDebugInfo(/*verbosity=*/1, &out3); + EXPECT_THAT(out3, Ne(out1)); // Merge into the main index. Debug strings should change again.
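// [Usage sketch] The string-based debug API these expectations cover,
// assuming an initialized Index named index; verbosity 1 appends the more
// detailed posting-list information on top of the summary.
std::string summary;
index->GetDebugInfo(/*verbosity=*/0, &summary);   // Lexicon and hit buffer stats.
std::string detailed;
index->GetDebugInfo(/*verbosity=*/1, &detailed);  // Adds flash index storage details.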
ICING_ASSERT_OK(index_->Merge()); - IndexDebugInfoProto out3 = index_->GetDebugInfo(/*verbosity=*/0); - EXPECT_TRUE(out3.has_index_storage_info()); - EXPECT_THAT(out3.main_index_info().lexicon_info(), Not(IsEmpty())); - EXPECT_THAT(out3.main_index_info().last_added_document_id(), - Eq(kDocumentId3)); - EXPECT_THAT(out3.lite_index_info().curr_size(), Eq(0)); - EXPECT_THAT(out3.lite_index_info().hit_buffer_size(), Gt(0)); - EXPECT_THAT(out3.lite_index_info().last_added_document_id(), - Eq(kInvalidDocumentId)); - EXPECT_THAT(out3.lite_index_info().searchable_end(), Eq(0)); - EXPECT_THAT(out3.lite_index_info().index_crc(), Gt(0)); - EXPECT_THAT(out3.lite_index_info().lexicon_info(), Not(IsEmpty())); + std::string out4; + index_->GetDebugInfo(/*verbosity=*/0, &out4); + EXPECT_THAT(out4, Ne(out0)); + EXPECT_THAT(out4, Ne(out2)); + + std::string out5; + index_->GetDebugInfo(/*verbosity=*/1, &out5); + EXPECT_THAT(out5, Ne(out1)); + EXPECT_THAT(out5, Ne(out3)); } TEST_F(IndexTest, BackfillingMultipleTermsSucceeds) { diff --git a/icing/index/iterator/doc-hit-info-iterator-and.cc b/icing/index/iterator/doc-hit-info-iterator-and.cc index 543e9ef..66f87bd 100644 --- a/icing/index/iterator/doc-hit-info-iterator-and.cc +++ b/icing/index/iterator/doc-hit-info-iterator-and.cc @@ -14,7 +14,8 @@ #include "icing/index/iterator/doc-hit-info-iterator-and.h" -#include <cstddef> +#include <stddef.h> + #include <cstdint> #include <memory> #include <string> @@ -161,7 +162,6 @@ libtextclassifier3::Status DocHitInfoIteratorAndNary::Advance() { DocumentId unused; ICING_ASSIGN_OR_RETURN( unused, AdvanceTo(iterator.get(), potential_document_id)); - (void)unused; // Silence unused warning. } if (iterator->doc_hit_info().document_id() == potential_document_id) { diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc index 7c6d924..43a846b 100644 --- a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc +++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc @@ -48,13 +48,13 @@ using ::testing::ElementsAreArray; using ::testing::Eq; using ::testing::IsEmpty; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = - PropertyConfigProto::Cardinality::OPTIONAL; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = + PropertyConfigProto_Cardinality_Code_OPTIONAL; -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = - StringIndexingConfig::TokenizerType::PLAIN; +constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = + StringIndexingConfig_TokenizerType_Code_PLAIN; -constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; +constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; class DocHitInfoIteratorSectionRestrictTest : public ::testing::Test { protected: diff --git a/icing/index/lite/doc-hit-info-iterator-term-lite.cc b/icing/index/lite/doc-hit-info-iterator-term-lite.cc index f215d63..d535d7f 100644 --- a/icing/index/lite/doc-hit-info-iterator-term-lite.cc +++ b/icing/index/lite/doc-hit-info-iterator-term-lite.cc @@ -45,13 +45,8 @@ libtextclassifier3::Status DocHitInfoIteratorTermLite::Advance() { if (cached_hits_idx_ == -1) { libtextclassifier3::Status status = RetrieveMoreHits(); if (!status.ok()) { - if (!absl_ports::IsNotFound(status)) { - // NOT_FOUND is expected to happen (not every term will be in the main - // index!). Other errors are worth logging. 
- ICING_LOG(ERROR) - << "Encountered unexpected failure while retrieving hits " - << status.error_message(); - } + ICING_LOG(ERROR) << "Failed to retrieve more hits " + << status.error_message(); return absl_ports::ResourceExhaustedError( "No more DocHitInfos in iterator"); } @@ -77,8 +72,7 @@ libtextclassifier3::Status DocHitInfoIteratorTermLiteExact::RetrieveMoreHits() { ICING_ASSIGN_OR_RETURN(uint32_t term_id, term_id_codec_->EncodeTvi(tvi, TviType::LITE)); lite_index_->AppendHits(term_id, section_restrict_mask_, - /*only_from_prefix_sections=*/false, - /*namespace_checker=*/nullptr, &cached_hits_); + /*only_from_prefix_sections=*/false, &cached_hits_); cached_hits_idx_ = 0; return libtextclassifier3::Status::OK; } @@ -101,7 +95,7 @@ DocHitInfoIteratorTermLitePrefix::RetrieveMoreHits() { term_id_codec_->EncodeTvi(it.GetValueIndex(), TviType::LITE)); lite_index_->AppendHits(term_id, section_restrict_mask_, /*only_from_prefix_sections=*/!exact_match, - /*namespace_checker=*/nullptr, &cached_hits_); + &cached_hits_); ++terms_matched; } if (terms_matched > 1) { diff --git a/icing/index/lite/doc-hit-info-iterator-term-lite.h b/icing/index/lite/doc-hit-info-iterator-term-lite.h index 179fc93..8dbe043 100644 --- a/icing/index/lite/doc-hit-info-iterator-term-lite.h +++ b/icing/index/lite/doc-hit-info-iterator-term-lite.h @@ -82,11 +82,6 @@ class DocHitInfoIteratorTermLite : public DocHitInfoIterator { protected: // Add DocHitInfos corresponding to term_ to cached_hits_. - // - // Returns: - // - OK, on success - // - NOT_FOUND if no term matching term_ was found in the lexicon. - // - INVALID_ARGUMENT if unable to properly encode the termid virtual libtextclassifier3::Status RetrieveMoreHits() = 0; const std::string term_; diff --git a/icing/index/lite/lite-index.cc b/icing/index/lite/lite-index.cc index a5c6baf..fb23934 100644 --- a/icing/index/lite/lite-index.cc +++ b/icing/index/lite/lite-index.cc @@ -14,11 +14,12 @@ #include "icing/index/lite/lite-index.h" +#include <inttypes.h> +#include <stddef.h> +#include <stdint.h> #include <sys/mman.h> #include <algorithm> -#include <cinttypes> -#include <cstddef> #include <cstdint> #include <memory> #include <string> @@ -336,12 +337,9 @@ libtextclassifier3::StatusOr<uint32_t> LiteIndex::GetTermId( int LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask, bool only_from_prefix_sections, - const NamespaceChecker* namespace_checker, std::vector<DocHitInfo>* hits_out) { int count = 0; DocumentId last_document_id = kInvalidDocumentId; - // Record whether the last document belongs to the given namespaces. - bool last_document_in_namespace = false; for (uint32_t idx = Seek(term_id); idx < header_->cur_size(); idx++) { TermIdHitPair term_id_hit_pair( hit_buffer_.array_cast<TermIdHitPair>()[idx]); @@ -358,31 +356,22 @@ int LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask, } DocumentId document_id = hit.document_id(); if (document_id != last_document_id) { - last_document_id = document_id; - last_document_in_namespace = - namespace_checker == nullptr || - namespace_checker->BelongsToTargetNamespaces(document_id); - if (!last_document_in_namespace) { - // The document is removed or expired or not belongs to target - // namespaces. 
- continue; - } ++count; if (hits_out != nullptr) { hits_out->push_back(DocHitInfo(document_id)); } + last_document_id = document_id; } - if (hits_out != nullptr && last_document_in_namespace) { + if (hits_out != nullptr) { hits_out->back().UpdateSection(hit.section_id(), hit.term_frequency()); } } return count; } -libtextclassifier3::StatusOr<int> LiteIndex::CountHits( - uint32_t term_id, const NamespaceChecker* namespace_checker) { +int LiteIndex::CountHits(uint32_t term_id) { return AppendHits(term_id, kSectionIdMaskAll, - /*only_from_prefix_sections=*/false, namespace_checker, + /*only_from_prefix_sections=*/false, /*hits_out=*/nullptr); } @@ -391,16 +380,15 @@ bool LiteIndex::is_full() const { lexicon_.min_free_fraction() < (1.0 - kTrieFullFraction)); } -IndexDebugInfoProto::LiteIndexDebugInfoProto LiteIndex::GetDebugInfo( - int verbosity) { - IndexDebugInfoProto::LiteIndexDebugInfoProto res; - res.set_curr_size(header_->cur_size()); - res.set_hit_buffer_size(options_.hit_buffer_size); - res.set_last_added_document_id(header_->last_added_docid()); - res.set_searchable_end(header_->searchable_end()); - res.set_index_crc(ComputeChecksum().Get()); - lexicon_.GetDebugInfo(verbosity, res.mutable_lexicon_info()); - return res; +void LiteIndex::GetDebugInfo(int verbosity, std::string* out) const { + absl_ports::StrAppend( + out, IcingStringUtil::StringPrintf("Lite Index\nHit buffer %u/%u\n", + header_->cur_size(), + options_.hit_buffer_size)); + + // Lexicon. + out->append("Lexicon stats:\n"); + lexicon_.GetDebugInfo(verbosity, out); } libtextclassifier3::StatusOr<int64_t> LiteIndex::GetElementsSize() const { @@ -421,8 +409,12 @@ IndexStorageInfoProto LiteIndex::GetStorageInfo( IndexStorageInfoProto storage_info) const { int64_t header_and_hit_buffer_file_size = filesystem_->GetFileSize(hit_buffer_fd_.get()); - storage_info.set_lite_index_hit_buffer_size( - IcingFilesystem::SanitizeFileSize(header_and_hit_buffer_file_size)); + if (header_and_hit_buffer_file_size != Filesystem::kBadFileSize) { + storage_info.set_lite_index_hit_buffer_size( + header_and_hit_buffer_file_size); + } else { + storage_info.set_lite_index_hit_buffer_size(-1); + } int64_t lexicon_disk_usage = lexicon_.GetElementsSize(); if (lexicon_disk_usage != Filesystem::kBadFileSize) { storage_info.set_lite_index_lexicon_size(lexicon_disk_usage); diff --git a/icing/index/lite/lite-index.h b/icing/index/lite/lite-index.h index 378fc94..b134aba 100644 --- a/icing/index/lite/lite-index.h +++ b/icing/index/lite/lite-index.h @@ -37,12 +37,10 @@ #include "icing/legacy/index/icing-lite-index-header.h" #include "icing/legacy/index/icing-lite-index-options.h" #include "icing/legacy/index/icing-mmapper.h" -#include "icing/proto/debug.pb.h" #include "icing/proto/storage.pb.h" #include "icing/proto/term.pb.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" -#include "icing/store/namespace-checker.h" #include "icing/store/namespace-id.h" #include "icing/util/bit-util.h" #include "icing/util/crc32.h" @@ -142,19 +140,13 @@ class LiteIndex { // skipping hits in non-prefix sections if only_from_prefix_sections is true, // to hits_out. If hits_out is nullptr, no hits will be added. // - // Only those hits which belongs to the given namespaces will be counted and - // appended. A nullptr namespace checker will disable this check. - // // Returns the number of hits that would be added to hits_out. 
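// [Usage sketch] For the AppendHits declared just below, assuming a populated
// LiteIndex named lite_index and a valid term_id: pass a vector to collect
// DocHitInfos, or nullptr to count without copying, which is exactly how the
// new CountHits is implemented in lite-index.cc.
std::vector<DocHitInfo> hits;
int num_hits = lite_index->AppendHits(term_id, kSectionIdMaskAll,
                                      /*only_from_prefix_sections=*/false,
                                      &hits);
int count_only = lite_index->CountHits(term_id);  // Same count, no DocHitInfos.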
int AppendHits(uint32_t term_id, SectionIdMask section_id_mask, bool only_from_prefix_sections, - const NamespaceChecker* namespace_checker, std::vector<DocHitInfo>* hits_out); // Returns the hit count of the term. - // Only those hits which belongs to the given namespaces will be counted. - libtextclassifier3::StatusOr<int> CountHits( - uint32_t term_id, const NamespaceChecker* namespace_checker); + int CountHits(uint32_t term_id); // Check if buffer has reached its capacity. bool is_full() const; @@ -242,7 +234,7 @@ class LiteIndex { // Returns debug information for the index in out. // verbosity <= 0, simplest debug information - size of lexicon, hit buffer // verbosity > 0, more detailed debug information from the lexicon. - IndexDebugInfoProto::LiteIndexDebugInfoProto GetDebugInfo(int verbosity); + void GetDebugInfo(int verbosity, std::string* out) const; // Returns the byte size of all the elements held in the index. This excludes // the size of any internal metadata of the index, e.g. the index's header. diff --git a/icing/index/lite/lite-index_test.cc b/icing/index/lite/lite-index_test.cc deleted file mode 100644 index 825f830..0000000 --- a/icing/index/lite/lite-index_test.cc +++ /dev/null @@ -1,110 +0,0 @@ -// Copyright (C) 2019 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
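// [Sketch] The test file deleted below exercised AppendHits through the old
// NamespaceChecker interface. Its replacement filters at the lexicon level
// via trie properties; a usage fragment of the IsTermInNamespaces helper
// shown in the index.cc hunk above (lexicon and value_index are assumed
// context):
IcingDynamicTrie::PropertyReadersAll property_reader(lexicon);
std::vector<NamespaceId> target_namespaces = {0, 1};
if (IsTermInNamespaces(property_reader, value_index, target_namespaces)) {
  // The term has at least one hit in namespace 0 or namespace 1.
}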
- -#include "icing/index/lite/lite-index.h" - -#include <vector> - -#include "gmock/gmock.h" -#include "gtest/gtest.h" -#include "icing/index/term-id-codec.h" -#include "icing/legacy/index/icing-mock-filesystem.h" -#include "icing/schema/section.h" -#include "icing/store/namespace-checker.h" -#include "icing/testing/common-matchers.h" -#include "icing/testing/tmp-directory.h" - -namespace icing { -namespace lib { - -namespace { - -using ::testing::Eq; -using ::testing::IsEmpty; -using ::testing::SizeIs; - -class AlwaysFalseNamespaceCheckerImpl : public NamespaceChecker { - public: - bool BelongsToTargetNamespaces(DocumentId document_id) const override { - return false; - } -}; - -class LiteIndexTest : public testing::Test { - protected: - void SetUp() override { - index_dir_ = GetTestTempDir() + "/test_dir"; - ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(index_dir_.c_str())); - - std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; - LiteIndex::Options options(lite_index_file_name, - /*hit_buffer_want_merge_bytes=*/1024 * 1024); - ICING_ASSERT_OK_AND_ASSIGN(lite_index_, - LiteIndex::Create(options, &icing_filesystem_)); - - ICING_ASSERT_OK_AND_ASSIGN( - term_id_codec_, - TermIdCodec::Create( - IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()), - IcingDynamicTrie::max_value_index(options.lexicon_options))); - } - - void TearDown() override { - ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(index_dir_.c_str())); - } - - std::string index_dir_; - Filesystem filesystem_; - IcingFilesystem icing_filesystem_; - std::unique_ptr<LiteIndex> lite_index_; - std::unique_ptr<TermIdCodec> term_id_codec_; -}; - -constexpr NamespaceId kNamespace0 = 0; - -TEST_F(LiteIndexTest, LiteIndexAppendHits) { - ICING_ASSERT_OK_AND_ASSIGN( - uint32_t tvi, - lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0)); - ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, - term_id_codec_->EncodeTvi(tvi, TviType::LITE)); - Hit doc_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); - Hit doc_hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); - ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit0)); - ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit1)); - - std::vector<DocHitInfo> hits1; - lite_index_->AppendHits(foo_term_id, kSectionIdMaskAll, - /*only_from_prefix_sections=*/false, - /*namespace_checker=*/nullptr, &hits1); - EXPECT_THAT(hits1, SizeIs(1)); - EXPECT_THAT(hits1.back().document_id(), Eq(0)); - // Check that the hits are coming from section 0 and section 1. - EXPECT_THAT(hits1.back().hit_section_ids_mask(), Eq(0b11)); - - std::vector<DocHitInfo> hits2; - AlwaysFalseNamespaceCheckerImpl always_false_namespace_checker; - lite_index_->AppendHits(foo_term_id, kSectionIdMaskAll, - /*only_from_prefix_sections=*/false, - &always_false_namespace_checker, &hits2); - // Check that no hits are returned because they get skipped by the namespace - // checker. 
- EXPECT_THAT(hits2, IsEmpty()); -} - -} // namespace -} // namespace lib -} // namespace icing diff --git a/icing/index/main/doc-hit-info-iterator-term-main.cc b/icing/index/main/doc-hit-info-iterator-term-main.cc index 98bc18e..5553c1e 100644 --- a/icing/index/main/doc-hit-info-iterator-term-main.cc +++ b/icing/index/main/doc-hit-info-iterator-term-main.cc @@ -57,9 +57,8 @@ libtextclassifier3::Status DocHitInfoIteratorTermMain::Advance() { if (!absl_ports::IsNotFound(status)) { // NOT_FOUND is expected to happen (not every term will be in the main // index!). Other errors are worth logging. - ICING_LOG(ERROR) - << "Encountered unexpected failure while retrieving hits " - << status.error_message(); + ICING_LOG(ERROR) << "Failed to retrieve more hits " + << status.error_message(); } return absl_ports::ResourceExhaustedError( "No more DocHitInfos in iterator"); diff --git a/icing/index/main/flash-index-storage.cc b/icing/index/main/flash-index-storage.cc index 3c52375..f125b6d 100644 --- a/icing/index/main/flash-index-storage.cc +++ b/icing/index/main/flash-index-storage.cc @@ -14,11 +14,11 @@ #include "icing/index/main/flash-index-storage.h" +#include <errno.h> +#include <inttypes.h> #include <sys/types.h> #include <algorithm> -#include <cerrno> -#include <cinttypes> #include <cstdint> #include <memory> #include <unordered_set> diff --git a/icing/index/main/flash-index-storage.h b/icing/index/main/flash-index-storage.h index 6c6fbb8..8d5b50b 100644 --- a/icing/index/main/flash-index-storage.h +++ b/icing/index/main/flash-index-storage.h @@ -159,7 +159,6 @@ class FlashIndexStorage { libtextclassifier3::Status Reset(); - // TODO(b/222349894) Convert the string output to a protocol buffer instead. void GetDebugInfo(int verbosity, std::string* out) const; private: diff --git a/icing/index/main/flash-index-storage_test.cc b/icing/index/main/flash-index-storage_test.cc index 25fcaad..7e15524 100644 --- a/icing/index/main/flash-index-storage_test.cc +++ b/icing/index/main/flash-index-storage_test.cc @@ -14,10 +14,10 @@ #include "icing/index/main/flash-index-storage.h" +#include <stdlib.h> #include <unistd.h> #include <algorithm> -#include <cstdlib> #include <limits> #include <utility> #include <vector> diff --git a/icing/index/main/index-block.cc b/icing/index/main/index-block.cc index c6ab345..4590d06 100644 --- a/icing/index/main/index-block.cc +++ b/icing/index/main/index-block.cc @@ -14,8 +14,9 @@ #include "icing/index/main/index-block.h" +#include <inttypes.h> + #include <algorithm> -#include <cinttypes> #include <limits> #include "icing/text_classifier/lib3/utils/base/statusor.h" diff --git a/icing/index/main/index-block.h b/icing/index/main/index-block.h index 5d75a2a..edf9a79 100644 --- a/icing/index/main/index-block.h +++ b/icing/index/main/index-block.h @@ -15,10 +15,10 @@ #ifndef ICING_INDEX_MAIN_INDEX_BLOCK_H_ #define ICING_INDEX_MAIN_INDEX_BLOCK_H_ +#include <string.h> #include <sys/mman.h> #include <algorithm> -#include <cstring> #include <limits> #include <memory> #include <string> diff --git a/icing/index/main/main-index.cc b/icing/index/main/main-index.cc index 2d6007b..8ae6b27 100644 --- a/icing/index/main/main-index.cc +++ b/icing/index/main/main-index.cc @@ -133,10 +133,18 @@ libtextclassifier3::StatusOr<int64_t> MainIndex::GetElementsSize() const { IndexStorageInfoProto MainIndex::GetStorageInfo( IndexStorageInfoProto storage_info) const { - storage_info.set_main_index_lexicon_size( - IcingFilesystem::SanitizeFileSize(main_lexicon_->GetElementsSize())); - 
storage_info.set_main_index_storage_size( - Filesystem::SanitizeFileSize(flash_index_storage_->GetElementsSize())); + int64_t lexicon_elt_size = main_lexicon_->GetElementsSize(); + if (lexicon_elt_size != IcingFilesystem::kBadFileSize) { + storage_info.set_main_index_lexicon_size(lexicon_elt_size); + } else { + storage_info.set_main_index_lexicon_size(-1); + } + int64_t index_elt_size = flash_index_storage_->GetElementsSize(); + if (index_elt_size != IcingFilesystem::kBadFileSize) { + storage_info.set_main_index_storage_size(index_elt_size); + } else { + storage_info.set_main_index_storage_size(-1); + } storage_info.set_main_index_block_size(flash_index_storage_->block_size()); storage_info.set_num_blocks(flash_index_storage_->num_blocks()); storage_info.set_min_free_fraction(flash_index_storage_->min_free_fraction()); @@ -178,7 +186,7 @@ MainIndex::GetAccessorForPrefixTerm(const std::string& prefix) { if (!exact && !hits_in_prefix_section.HasProperty(main_itr.GetValueIndex())) { // Found it, but it doesn't have prefix hits. Exit early. No need to // retrieve the posting list because there's nothing there for us. - return absl_ports::NotFoundError("The term doesn't have any prefix hits."); + return libtextclassifier3::Status::OK; } PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid; memcpy(&posting_list_id, main_itr.GetValue(), sizeof(posting_list_id)); @@ -209,48 +217,46 @@ bool IsTermInNamespaces( libtextclassifier3::StatusOr<std::vector<TermMetadata>> MainIndex::FindTermsByPrefix(const std::string& prefix, - TermMatchType::Code term_match_type, - const NamespaceChecker* namespace_checker) { + const std::vector<NamespaceId>& namespace_ids, + int num_to_return) { // Finds all the terms that start with the given prefix in the lexicon. IcingDynamicTrie::Iterator term_iterator(*main_lexicon_, prefix.c_str()); + // A property reader to help check if a term has some property. + IcingDynamicTrie::PropertyReadersAll property_reader(*main_lexicon_); + std::vector<TermMetadata> term_metadata_list; - while (term_iterator.IsValid()) { - int count = 0; - DocumentId last_document_id = kInvalidDocumentId; + while (term_iterator.IsValid() && term_metadata_list.size() < num_to_return) { + uint32_t term_value_index = term_iterator.GetValueIndex(); + // Skips the terms that don't exist in the given namespaces. We won't skip + // any terms if namespace_ids is empty. + if (!IsTermInNamespaces(property_reader, term_value_index, namespace_ids)) { + term_iterator.Advance(); + continue; + } PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid; memcpy(&posting_list_id, term_iterator.GetValue(), sizeof(posting_list_id)); - ICING_ASSIGN_OR_RETURN(PostingListAccessor pl_accessor, - PostingListAccessor::CreateFromExisting( - flash_index_storage_.get(), posting_list_id)); - ICING_ASSIGN_OR_RETURN(std::vector<Hit> hits, - pl_accessor.GetNextHitsBatch()); - for (const Hit& hit : hits) { - DocumentId document_id = hit.document_id(); - if (document_id != last_document_id) { - last_document_id = document_id; - if (term_match_type == TermMatchType::EXACT_ONLY && - hit.is_prefix_hit()) { - continue; - } - if (!namespace_checker->BelongsToTargetNamespaces(document_id)) { - // The document is removed or expired or not belongs to target - // namespaces. - continue; - } - // TODO(b/152934343) Add search type in SuggestionSpec to ask user to - // input search type, prefix or exact. And make different score strategy - base on that.
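// [Sketch, hypothetical helper] The GetStorageInfo hunks in this change all
// repeat one pattern: write the size if it is readable, otherwise -1 instead
// of a garbage value. Condensed, with SanitizeOrNegative as an illustrative
// name (the removed side of this diff expresses the same idea as
// Filesystem::SanitizeFileSize):
#include <cstdint>
int64_t SanitizeOrNegative(int64_t size, int64_t bad_value) {
  return size != bad_value ? size : -1;
}
// e.g. storage_info.set_main_index_lexicon_size(
//          SanitizeOrNegative(main_lexicon_->GetElementsSize(),
//                             IcingFilesystem::kBadFileSize));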
- ++count; - } - } - if (count > 0) { - term_metadata_list.push_back(TermMetadata(term_iterator.GetKey(), count)); - } + // Getting the actual hit count would require reading the entire posting + // list chain. We take an approximation to avoid all of those IO ops. + // Because we are not reading the posting lists, it is impossible to + // differentiate between single max-size posting lists and chains of + // max-size posting lists. We assume that the impact on scoring is not + // significant. + int approx_hit_count = IndexBlock::ApproximateFullPostingListHitsForBlock( + flash_index_storage_->block_size(), + posting_list_id.posting_list_index_bits()); + term_metadata_list.emplace_back(term_iterator.GetKey(), approx_hit_count); term_iterator.Advance(); } + if (term_iterator.IsValid()) { + // We exited the loop above because we hit the num_to_return limit. + ICING_LOG(WARNING) << "Ran into limit of " << num_to_return + << " retrieving suggestions for " << prefix + << ". Some suggestions may not be returned and others " "may be misranked."; + } return term_metadata_list; } @@ -607,22 +613,16 @@ libtextclassifier3::Status MainIndex::AddPrefixBackfillHits( return libtextclassifier3::Status::OK; } -IndexDebugInfoProto::MainIndexDebugInfoProto MainIndex::GetDebugInfo( - int verbosity) const { - IndexDebugInfoProto::MainIndexDebugInfoProto res; - +void MainIndex::GetDebugInfo(int verbosity, std::string* out) const { // Lexicon. - main_lexicon_->GetDebugInfo(verbosity, res.mutable_lexicon_info()); - - res.set_last_added_document_id(last_added_document_id()); + out->append("Main Lexicon stats:\n"); + main_lexicon_->GetDebugInfo(verbosity, out); if (verbosity <= 0) { - return res; + return; } - flash_index_storage_->GetDebugInfo(verbosity, - res.mutable_flash_index_storage_info()); - return res; + flash_index_storage_->GetDebugInfo(verbosity, out); } } // namespace lib diff --git a/icing/index/main/main-index.h b/icing/index/main/main-index.h index abb0418..43635ca 100644 --- a/icing/index/main/main-index.h +++ b/icing/index/main/main-index.h @@ -27,9 +27,7 @@ #include "icing/index/term-metadata.h" #include "icing/legacy/index/icing-dynamic-trie.h" #include "icing/legacy/index/icing-filesystem.h" -#include "icing/proto/debug.pb.h" #include "icing/proto/storage.pb.h" -#include "icing/store/namespace-checker.h" #include "icing/store/namespace-id.h" #include "icing/util/status-macros.h" @@ -73,17 +71,18 @@ class MainIndex { // Finds terms with the given prefix in the given namespaces. If // 'namespace_ids' is empty, returns results from all the namespaces. The // input prefix must be normalized, otherwise inaccurate results may be - // returned. If term_match_type is EXACT, only exact hit will be counted and - // it is PREFIX, both prefix and exact hits will be counted. Results are not - // sorted specifically and are in lexigraphical order. Number of results are - // no more than 'num_to_return'. + // returned. Results are not sorted specifically and are in lexicographical + // order. The number of results is no more than 'num_to_return'. + // + // The hit count returned with each TermMetadata is an approximation based on + // posting list size. + // // Returns: // A list of TermMetadata on success // INTERNAL_ERROR if failed to access term data.
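// [Illustration, assumed arithmetic] The approximation documented above: a
// block that holds 2^posting_list_index_bits posting lists gives each list
// block_size >> posting_list_index_bits bytes, and dividing by an assumed
// average encoded hit size estimates how many hits a full list holds. The
// real IndexBlock::ApproximateFullPostingListHitsForBlock accounts for the
// exact block header and hit encoding; the constant here is a placeholder.
#include <cstdint>
constexpr uint32_t kAssumedBytesPerHit = 5;  // Placeholder, not icing's value.
uint32_t ApproximateHits(uint32_t block_size,
                         uint32_t posting_list_index_bits) {
  uint32_t posting_list_bytes = block_size >> posting_list_index_bits;
  return posting_list_bytes / kAssumedBytesPerHit;
}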
libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindTermsByPrefix( - const std::string& prefix, TermMatchType::Code term_match_type, - const NamespaceChecker* namespace_checker); + const std::string& prefix, const std::vector<NamespaceId>& namespace_ids, + int num_to_return); struct LexiconMergeOutputs { // Maps from main_lexicon tvi for new branching point to the main_lexicon @@ -186,8 +185,7 @@ class MainIndex { // verbosity <= 0, simplest debug information - just the lexicon // verbosity > 0, more detailed debug information including raw postings // lists. - IndexDebugInfoProto::MainIndexDebugInfoProto GetDebugInfo( - int verbosity) const; + void GetDebugInfo(int verbosity, std::string* out) const; private: libtextclassifier3::Status Init(const std::string& index_directory, diff --git a/icing/index/main/main-index_test.cc b/icing/index/main/main-index_test.cc index fa83d68..74139be 100644 --- a/icing/index/main/main-index_test.cc +++ b/icing/index/main/main-index_test.cc @@ -162,34 +162,6 @@ TEST_F(MainIndexTest, MainIndexGetAccessorForPrefixReturnsValidAccessor) { EXPECT_THAT(main_index->GetAccessorForPrefixTerm("foo"), IsOk()); } -TEST_F(MainIndexTest, MainIndexGetAccessorForPrefixReturnsNotFound) { - // 1. Index one doc in the Lite Index: - // - Doc0 {"foot" is_in_prefix_section=false} - ICING_ASSERT_OK_AND_ASSIGN( - uint32_t tvi, - lite_index_->InsertTerm("foot", TermMatchType::EXACT_ONLY, kNamespace0)); - ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id, - term_id_codec_->EncodeTvi(tvi, TviType::LITE)); - - Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); - ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit)); - - // 2. Create the main index. It should have no entries in its lexicon. - std::string main_index_file_name = index_dir_ + "/test_file.idx.index"; - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<MainIndex> main_index, - MainIndex::Create(main_index_file_name, &filesystem_, - &icing_filesystem_)); - - // 3. Merge the index. The main index should return not found when we search - // prefix contain "foo". - ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get())); - // GetAccessorForPrefixTerm should return a valid accessor for "foo". - EXPECT_THAT(main_index->GetAccessorForPrefixTerm("foo"), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); -} - TEST_F(MainIndexTest, MainIndexGetAccessorForExactTermNotFound) { // Create the main index. It should have no entries in its lexicon. std::string main_index_file_name = index_dir_ + "/test_file.idx.index"; diff --git a/icing/index/main/posting-list-free.h b/icing/index/main/posting-list-free.h index 75b99d7..4b27401 100644 --- a/icing/index/main/posting-list-free.h +++ b/icing/index/main/posting-list-free.h @@ -15,10 +15,10 @@ #ifndef ICING_INDEX_MAIN_POSTING_LIST_FREE_H_ #define ICING_INDEX_MAIN_POSTING_LIST_FREE_H_ +#include <string.h> #include <sys/mman.h> #include <cstdint> -#include <cstring> #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/absl_ports/canonical_errors.h" @@ -115,7 +115,7 @@ class PostingListFree { // bytes which will store the next posting list index, the rest are unused and // can be anything. 
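The posting-list-free.h comment above describes an intrusive free list: once a posting list is freed, its first sizeof(PostingListIndex) bytes are reused to store the index of the next free posting list, so no side table is needed. Here is a minimal self-contained illustration of that trick; the type and names are invented for the sketch, not Icing's real API.

#include <cstdint>
#include <cstring>

using PostingListIndex = int32_t;

// Views a freed posting list region as a free-list node whose link is
// stored in the region's own first bytes.
struct FreeListNodeView {
  uint8_t* bytes;  // start of the freed posting list region

  PostingListIndex next() const {
    PostingListIndex index;
    memcpy(&index, bytes, sizeof(index));
    return index;
  }
  void set_next(PostingListIndex index) {
    memcpy(bytes, &index, sizeof(index));
  }
};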
uint8_t *posting_list_buffer_; - [[maybe_unused]] uint32_t size_in_bytes_; + uint32_t size_in_bytes_; static_assert(sizeof(PostingListIndex) <= posting_list_utils::min_posting_list_size(), diff --git a/icing/index/main/posting-list-used.h b/icing/index/main/posting-list-used.h index 8944034..1b2e24e 100644 --- a/icing/index/main/posting-list-used.h +++ b/icing/index/main/posting-list-used.h @@ -15,10 +15,10 @@ #ifndef ICING_INDEX_MAIN_POSTING_LIST_USED_H_ #define ICING_INDEX_MAIN_POSTING_LIST_USED_H_ +#include <string.h> #include <sys/mman.h> #include <algorithm> -#include <cstring> #include <vector> #include "icing/text_classifier/lib3/utils/base/status.h" diff --git a/icing/jni/icing-search-engine-jni.cc b/icing/jni/icing-search-engine-jni.cc index bcc35e6..ea2bcf7 100644 --- a/icing/jni/icing-search-engine-jni.cc +++ b/icing/jni/icing-search-engine-jni.cc @@ -166,7 +166,6 @@ Java_com_google_android_icing_IcingSearchEngine_nativeGetSchemaType( env->GetStringUTFChars(schema_type, /*isCopy=*/nullptr); icing::lib::GetSchemaTypeResultProto get_schema_type_result_proto = icing->GetSchemaType(native_schema_type); - env->ReleaseStringUTFChars(schema_type, native_schema_type); return SerializeProtoToJniByteArray(env, get_schema_type_result_proto); } @@ -193,20 +192,19 @@ JNIEXPORT jbyteArray JNICALL Java_com_google_android_icing_IcingSearchEngine_nativeGet( JNIEnv* env, jclass clazz, jobject object, jstring name_space, jstring uri, jbyteArray result_spec_bytes) { - icing::lib::GetResultSpecProto get_result_spec; - if (!ParseProtoFromJniByteArray(env, result_spec_bytes, &get_result_spec)) { - ICING_LOG(ERROR) << "Failed to parse GetResultSpecProto in nativeGet"; - return nullptr; - } icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); + const char* native_name_space = env->GetStringUTFChars(name_space, /*isCopy=*/nullptr); const char* native_uri = env->GetStringUTFChars(uri, /*isCopy=*/nullptr); + icing::lib::GetResultSpecProto get_result_spec; + if (!ParseProtoFromJniByteArray(env, result_spec_bytes, &get_result_spec)) { + ICING_LOG(ERROR) << "Failed to parse GetResultSpecProto in nativeGet"; + return nullptr; + } icing::lib::GetResultProto get_result_proto = icing->Get(native_name_space, native_uri, get_result_spec); - env->ReleaseStringUTFChars(uri, native_uri); - env->ReleaseStringUTFChars(name_space, native_name_space); return SerializeProtoToJniByteArray(env, get_result_proto); } @@ -308,8 +306,6 @@ Java_com_google_android_icing_IcingSearchEngine_nativeDelete( const char* native_uri = env->GetStringUTFChars(uri, /*isCopy=*/nullptr); icing::lib::DeleteResultProto delete_result_proto = icing->Delete(native_name_space, native_uri); - env->ReleaseStringUTFChars(uri, native_uri); - env->ReleaseStringUTFChars(name_space, native_name_space); return SerializeProtoToJniByteArray(env, delete_result_proto); } @@ -324,7 +320,6 @@ Java_com_google_android_icing_IcingSearchEngine_nativeDeleteByNamespace( env->GetStringUTFChars(name_space, /*isCopy=*/nullptr); icing::lib::DeleteByNamespaceResultProto delete_by_namespace_result_proto = icing->DeleteByNamespace(native_name_space); - env->ReleaseStringUTFChars(name_space, native_name_space); return SerializeProtoToJniByteArray(env, delete_by_namespace_result_proto); } @@ -339,7 +334,6 @@ Java_com_google_android_icing_IcingSearchEngine_nativeDeleteBySchemaType( env->GetStringUTFChars(schema_type, /*isCopy=*/nullptr); icing::lib::DeleteBySchemaTypeResultProto delete_by_schema_type_result_proto = 
icing->DeleteBySchemaType(native_schema_type); - env->ReleaseStringUTFChars(schema_type, native_schema_type); return SerializeProtoToJniByteArray(env, delete_by_schema_type_result_proto); } @@ -426,23 +420,4 @@ Java_com_google_android_icing_IcingSearchEngine_nativeReset( return SerializeProtoToJniByteArray(env, reset_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeSearchSuggestions( - JNIEnv* env, jclass clazz, jobject object, - jbyteArray suggestion_spec_bytes) { - icing::lib::IcingSearchEngine* icing = - GetIcingSearchEnginePointer(env, object); - - icing::lib::SuggestionSpecProto suggestion_spec_proto; - if (!ParseProtoFromJniByteArray(env, suggestion_spec_bytes, - &suggestion_spec_proto)) { - ICING_LOG(ERROR) << "Failed to parse SuggestionSpecProto in nativeSearch"; - return nullptr; - } - icing::lib::SuggestionResponse suggestionResponse = - icing->SearchSuggestions(suggestion_spec_proto); - - return SerializeProtoToJniByteArray(env, suggestionResponse); -} - } // extern "C" diff --git a/icing/legacy/core/icing-core-types.h b/icing/legacy/core/icing-core-types.h index 7db8408..cc12663 100644 --- a/icing/legacy/core/icing-core-types.h +++ b/icing/legacy/core/icing-core-types.h @@ -21,8 +21,9 @@ #ifndef ICING_LEGACY_CORE_ICING_CORE_TYPES_H_ #define ICING_LEGACY_CORE_ICING_CORE_TYPES_H_ +#include <stdint.h> + #include <cstddef> // size_t not defined implicitly for all platforms. -#include <cstdint> #include <vector> #include "icing/legacy/core/icing-compat.h" diff --git a/icing/legacy/core/icing-string-util.cc b/icing/legacy/core/icing-string-util.cc index ed06e03..2eb64ac 100644 --- a/icing/legacy/core/icing-string-util.cc +++ b/icing/legacy/core/icing-string-util.cc @@ -13,11 +13,12 @@ // limitations under the License. 
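The JNI hunks above delete the env->ReleaseStringUTFChars() calls that balanced each env->GetStringUTFChars(). In general JNI usage those calls must stay paired or the UTF-8 buffers can leak, and early returns make manual pairing fragile. A common generic remedy, shown as a sketch rather than as this codebase's actual pattern, is an RAII guard:

#include <jni.h>

// Acquires the UTF-8 chars of a jstring and releases them on scope exit.
class ScopedUtfChars {
 public:
  ScopedUtfChars(JNIEnv* env, jstring s)
      : env_(env), s_(s),
        chars_(env->GetStringUTFChars(s, /*isCopy=*/nullptr)) {}
  ~ScopedUtfChars() {
    if (chars_ != nullptr) env_->ReleaseStringUTFChars(s_, chars_);
  }
  const char* c_str() const { return chars_; }

 private:
  JNIEnv* env_;
  jstring s_;
  const char* chars_;
};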
#include "icing/legacy/core/icing-string-util.h" +#include <stdarg.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> + #include <algorithm> -#include <cstdarg> -#include <cstddef> -#include <cstdint> -#include <cstdio> #include <string> #include "icing/legacy/portable/icing-zlib.h" diff --git a/icing/legacy/core/icing-string-util.h b/icing/legacy/core/icing-string-util.h index e5e4941..767e581 100644 --- a/icing/legacy/core/icing-string-util.h +++ b/icing/legacy/core/icing-string-util.h @@ -15,8 +15,9 @@ #ifndef ICING_LEGACY_CORE_ICING_STRING_UTIL_H_ #define ICING_LEGACY_CORE_ICING_STRING_UTIL_H_ -#include <cstdarg> -#include <cstdint> +#include <stdarg.h> +#include <stdint.h> + #include <string> #include "icing/legacy/core/icing-compat.h" diff --git a/icing/legacy/core/icing-timer.h b/icing/legacy/core/icing-timer.h index af38912..49ba9ad 100644 --- a/icing/legacy/core/icing-timer.h +++ b/icing/legacy/core/icing-timer.h @@ -16,8 +16,7 @@ #define ICING_LEGACY_CORE_ICING_TIMER_H_ #include <sys/time.h> - -#include <ctime> +#include <time.h> namespace icing { namespace lib { diff --git a/icing/legacy/index/icing-array-storage.cc b/icing/legacy/index/icing-array-storage.cc index 4d2ef67..b462135 100644 --- a/icing/legacy/index/icing-array-storage.cc +++ b/icing/legacy/index/icing-array-storage.cc @@ -14,10 +14,10 @@ #include "icing/legacy/index/icing-array-storage.h" +#include <inttypes.h> #include <sys/mman.h> #include <algorithm> -#include <cinttypes> #include "icing/legacy/core/icing-string-util.h" #include "icing/legacy/core/icing-timer.h" diff --git a/icing/legacy/index/icing-array-storage.h b/icing/legacy/index/icing-array-storage.h index 0d93172..fad0565 100644 --- a/icing/legacy/index/icing-array-storage.h +++ b/icing/legacy/index/icing-array-storage.h @@ -20,7 +20,8 @@ #ifndef ICING_LEGACY_INDEX_ICING_ARRAY_STORAGE_H_ #define ICING_LEGACY_INDEX_ICING_ARRAY_STORAGE_H_ -#include <cstdint> +#include <stdint.h> + #include <string> #include <vector> diff --git a/icing/legacy/index/icing-bit-util.h b/icing/legacy/index/icing-bit-util.h index d0c3f50..3273a68 100644 --- a/icing/legacy/index/icing-bit-util.h +++ b/icing/legacy/index/icing-bit-util.h @@ -20,8 +20,9 @@ #ifndef ICING_LEGACY_INDEX_ICING_BIT_UTIL_H_ #define ICING_LEGACY_INDEX_ICING_BIT_UTIL_H_ -#include <cstdint> -#include <cstdio> +#include <stdint.h> +#include <stdio.h> + #include <limits> #include <vector> diff --git a/icing/legacy/index/icing-dynamic-trie.cc b/icing/legacy/index/icing-dynamic-trie.cc index 77876c4..29843ba 100644 --- a/icing/legacy/index/icing-dynamic-trie.cc +++ b/icing/legacy/index/icing-dynamic-trie.cc @@ -62,16 +62,15 @@ #include "icing/legacy/index/icing-dynamic-trie.h" +#include <errno.h> #include <fcntl.h> +#include <inttypes.h> +#include <string.h> #include <sys/mman.h> #include <sys/stat.h> #include <unistd.h> #include <algorithm> -#include <cerrno> -#include <cinttypes> -#include <cstdint> -#include <cstring> #include <memory> #include <utility> @@ -398,8 +397,6 @@ class IcingDynamicTrie::IcingDynamicTrieStorage { // storage. IcingScopedFd array_fds_[NUM_ARRAY_TYPES]; std::vector<IcingArrayStorage> array_storage_; - - // Legacy file system. Switch to use the new Filesystem class instead. 
const IcingFilesystem *filesystem_; }; @@ -1367,12 +1364,10 @@ uint32_t IcingDynamicTrie::size() const { return storage_->hdr().num_keys(); } -void IcingDynamicTrie::CollectStatsRecursive(const Node &node, Stats *stats, - uint32_t depth) const { +void IcingDynamicTrie::CollectStatsRecursive(const Node &node, + Stats *stats) const { if (node.is_leaf()) { stats->num_leaves++; - stats->sum_depth += depth; - stats->max_depth = max(stats->max_depth, depth); const char *suffix = storage_->GetSuffix(node.next_index()); stats->suffixes_used += strlen(suffix) + 1 + value_size(); if (!suffix[0]) { @@ -1384,16 +1379,13 @@ void IcingDynamicTrie::CollectStatsRecursive(const Node &node, Stats *stats, for (; i < (1U << node.log2_num_children()); i++) { const Next &next = *storage_->GetNext(node.next_index(), i); if (next.node_index() == kInvalidNodeIndex) break; - CollectStatsRecursive(*storage_->GetNode(next.node_index()), stats, - depth + 1); + CollectStatsRecursive(*storage_->GetNode(next.node_index()), stats); } // At least one valid node in each next array if (i == 0) { ICING_LOG(FATAL) << "No valid node in 'next' array"; } - stats->sum_children += i; - stats->max_children = max(stats->max_children, i); stats->child_counts[i - 1]++; stats->wasted[node.log2_num_children()] += @@ -1475,12 +1467,9 @@ std::string IcingDynamicTrie::Stats::DumpStats(int verbosity) const { "Wasted total: %u\n" "Num intermediates %u num leaves %u " "suffixes used %u null %u\n" - "avg and max children for intermediates: %.3f, %u\n" - "avg and max depth for leaves: %.3f, %u\n" "Total next frag: %.3f%%\n", total_wasted, num_intermediates, num_leaves, suffixes_used, - null_suffixes, 1. * sum_children / num_intermediates, max_children, - 1. * sum_depth / num_leaves, max_depth, + null_suffixes, 100. * math_util::SafeDivide((total_free + total_wasted), num_nexts)); } IcingStringUtil::SStringAppendF( diff --git a/icing/legacy/index/icing-dynamic-trie.h b/icing/legacy/index/icing-dynamic-trie.h index 013b926..7fe290b 100644 --- a/icing/legacy/index/icing-dynamic-trie.h +++ b/icing/legacy/index/icing-dynamic-trie.h @@ -35,7 +35,8 @@ #ifndef ICING_LEGACY_INDEX_ICING_DYNAMIC_TRIE_H_ #define ICING_LEGACY_INDEX_ICING_DYNAMIC_TRIE_H_ -#include <cstdint> +#include <stdint.h> + #include <memory> #include <string> #include <unordered_map> @@ -152,13 +153,8 @@ class IcingDynamicTrie : public IIcingStorage { uint32_t max_nodes; // Count of intermediate nodes. uint32_t num_intermediates; - // Total and maximum number of children of intermediate nodes. - uint32_t sum_children, max_children; - // Count of leaf nodes. uint32_t num_leaves; - // Total and maximum depth of leaf nodes. - uint32_t sum_depth, max_depth; // Next stats @@ -191,7 +187,6 @@ class IcingDynamicTrie : public IIcingStorage { uint32_t dirty_pages_nexts; uint32_t dirty_pages_suffixes; - // TODO(b/222349894) Convert the string output to a protocol buffer instead. std::string DumpStats(int verbosity) const; }; @@ -607,8 +602,7 @@ class IcingDynamicTrie : public IIcingStorage { static const uint32_t kInvalidSuffixIndex; // Stats helpers. - void CollectStatsRecursive(const Node &node, Stats *stats, - uint32_t depth = 0) const; + void CollectStatsRecursive(const Node &node, Stats *stats) const; // Helpers for Find and Insert. 
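The CollectStatsRecursive hunks above drop the depth and fan-out accumulators (sum_depth, max_depth, sum_children, max_children) from the trie statistics. For reference, this is the usual shape of such a recursive accumulation; the node type below is an illustrative stand-in, not IcingDynamicTrie's packed storage:

#include <algorithm>
#include <cstdint>
#include <vector>

struct Node {
  bool is_leaf = false;
  std::vector<const Node*> children;
};

struct DepthStats {
  uint32_t num_leaves = 0;
  uint64_t sum_depth = 0;  // average depth = sum_depth / num_leaves
  uint32_t max_depth = 0;
};

void Collect(const Node& node, DepthStats* stats, uint32_t depth = 0) {
  if (node.is_leaf) {
    ++stats->num_leaves;
    stats->sum_depth += depth;
    stats->max_depth = std::max(stats->max_depth, depth);
    return;
  }
  for (const Node* child : node.children) {
    Collect(*child, stats, depth + 1);
  }
}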
const Next *GetNextByChar(const Node *node, uint8_t key_char) const; diff --git a/icing/legacy/index/icing-filesystem.cc b/icing/legacy/index/icing-filesystem.cc index 4f5e571..90e9146 100644 --- a/icing/legacy/index/icing-filesystem.cc +++ b/icing/legacy/index/icing-filesystem.cc @@ -16,6 +16,7 @@ #include <dirent.h> #include <dlfcn.h> +#include <errno.h> #include <fcntl.h> #include <fnmatch.h> #include <pthread.h> @@ -26,7 +27,6 @@ #include <unistd.h> #include <algorithm> -#include <cerrno> #include <unordered_set> #include "icing/absl_ports/str_cat.h" diff --git a/icing/legacy/index/icing-filesystem.h b/icing/legacy/index/icing-filesystem.h index ce75a82..f645632 100644 --- a/icing/legacy/index/icing-filesystem.h +++ b/icing/legacy/index/icing-filesystem.h @@ -224,11 +224,6 @@ class IcingFilesystem { // Increments to_increment by size if size is valid, or sets to_increment // to kBadFileSize if either size or to_increment is kBadFileSize. static void IncrementByOrSetInvalid(uint64_t size, uint64_t *to_increment); - - // Return -1 if file_size is invalid. Otherwise, return file_size. - static int64_t SanitizeFileSize(int64_t file_size) { - return (file_size != kBadFileSize) ? file_size : -1; - } }; } // namespace lib diff --git a/icing/legacy/index/icing-flash-bitmap.h b/icing/legacy/index/icing-flash-bitmap.h index 6bb9591..3b3521a 100644 --- a/icing/legacy/index/icing-flash-bitmap.h +++ b/icing/legacy/index/icing-flash-bitmap.h @@ -37,7 +37,8 @@ #ifndef ICING_LEGACY_INDEX_ICING_FLASH_BITMAP_H_ #define ICING_LEGACY_INDEX_ICING_FLASH_BITMAP_H_ -#include <cstdint> +#include <stdint.h> + #include <memory> #include <string> @@ -138,7 +139,6 @@ class IcingFlashBitmap { // Upgrade for version 18. bool UpgradeTo18(); - // Legacy file system. Switch to use the new Filesystem class instead. 
const IcingFilesystem *const filesystem_; std::string filename_; OpenType open_type_; diff --git a/icing/legacy/index/icing-mmapper.cc b/icing/legacy/index/icing-mmapper.cc index 7946c82..737335c 100644 --- a/icing/legacy/index/icing-mmapper.cc +++ b/icing/legacy/index/icing-mmapper.cc @@ -17,11 +17,10 @@ // #include "icing/legacy/index/icing-mmapper.h" +#include <errno.h> +#include <string.h> #include <sys/mman.h> -#include <cerrno> -#include <cstring> - #include "icing/legacy/core/icing-string-util.h" #include "icing/legacy/index/icing-filesystem.h" #include "icing/util/logging.h" diff --git a/icing/legacy/index/icing-mock-filesystem.h b/icing/legacy/index/icing-mock-filesystem.h index 122ee7b..75ac62f 100644 --- a/icing/legacy/index/icing-mock-filesystem.h +++ b/icing/legacy/index/icing-mock-filesystem.h @@ -15,15 +15,16 @@ #ifndef ICING_LEGACY_INDEX_ICING_MOCK_FILESYSTEM_H_ #define ICING_LEGACY_INDEX_ICING_MOCK_FILESYSTEM_H_ -#include <cstdint> -#include <cstdio> -#include <cstring> +#include <stdint.h> +#include <stdio.h> +#include <string.h> + #include <memory> #include <string> #include <vector> -#include "gmock/gmock.h" #include "icing/legacy/index/icing-filesystem.h" +#include "gmock/gmock.h" namespace icing { namespace lib { diff --git a/icing/legacy/index/icing-storage-file.cc b/icing/legacy/index/icing-storage-file.cc index 35a4418..b27ec67 100644 --- a/icing/legacy/index/icing-storage-file.cc +++ b/icing/legacy/index/icing-storage-file.cc @@ -14,9 +14,9 @@ #include "icing/legacy/index/icing-storage-file.h" +#include <inttypes.h> #include <unistd.h> -#include <cinttypes> #include <string> #include "icing/legacy/core/icing-compat.h" diff --git a/icing/portable/endian.h b/icing/portable/endian.h index ecebb15..42f6c02 100644 --- a/icing/portable/endian.h +++ b/icing/portable/endian.h @@ -12,12 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. // -// Utility functions that depend on bytesex. We define versions of htonll and -// ntohll (HostToNetworkLL and NetworkToHostLL in our naming), as well as -// "Google" versions of all the standards: ghtonl, ghtons, and so on -// (GHostToNetworkL, GHostToNetworkS, etc in our naming). These functions do -// exactly the same as their standard variants, but don't require including the -// dangerous netinet/in.h. +// Utility functions that depend on bytesex. We define htonll and ntohll, +// as well as "Google" versions of all the standards: ghtonl, ghtons, and +// so on. These functions do exactly the same as their standard variants, +// but don't require including the dangerous netinet/in.h. #ifndef ICING_PORTABLE_ENDIAN_H_ #define ICING_PORTABLE_ENDIAN_H_ @@ -77,7 +75,7 @@ // The following guarantees declaration of the byte swap functions #ifdef COMPILER_MSVC -#include <cstdlib> // NOLINT(build/include) +#include <stdlib.h> // NOLINT(build/include) #define bswap_16(x) _byteswap_ushort(x) #define bswap_32(x) _byteswap_ulong(x) @@ -172,37 +170,37 @@ inline uint16 gbswap_16(uint16 host_int) { return bswap_16(host_int); } // correctly handle the (rather involved) definitions of bswap_32. // gcc guarantees that inline functions are as fast as macros, so // this isn't a performance hit. 
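The endian.h hunk that continues below renames the byte-order helpers back to their g-prefixed forms (ghtons, ghtonl, ghtonll). Whatever the naming, on a little-endian host the 32-bit operation is a plain byte reversal, which a standalone constexpr makes easy to verify:

#include <cstdint>

constexpr uint32_t bswap32(uint32_t x) {
  return ((x & 0xFF000000u) >> 24) | ((x & 0x00FF0000u) >> 8) |
         ((x & 0x0000FF00u) << 8) | ((x & 0x000000FFu) << 24);
}

// ghtonl(0x11223344) produces this value on a little-endian host; on a
// big-endian host the helpers are the identity.
static_assert(bswap32(0x11223344u) == 0x44332211u, "bytes are reversed");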
-inline uint16_t GHostToNetworkS(uint16_t x) { return gbswap_16(x); } -inline uint32_t GHostToNetworkL(uint32_t x) { return gbswap_32(x); } -inline uint64_t GHostToNetworkLL(uint64_t x) { return gbswap_64(x); } +inline uint16_t ghtons(uint16_t x) { return gbswap_16(x); } +inline uint32_t ghtonl(uint32_t x) { return gbswap_32(x); } +inline uint64_t ghtonll(uint64_t x) { return gbswap_64(x); } #elif defined IS_BIG_ENDIAN // These definitions are simpler on big-endian machines // These are functions instead of macros to avoid self-assignment warnings // on calls such as "i = ghtnol(i);". This also provides type checking. -inline uint16 GHostToNetworkS(uint16 x) { return x; } -inline uint32 GHostToNetworkL(uint32 x) { return x; } -inline uint64 GHostToNetworkLL(uint64 x) { return x; } +inline uint16 ghtons(uint16 x) { return x; } +inline uint32 ghtonl(uint32 x) { return x; } +inline uint64 ghtonll(uint64 x) { return x; } #else // bytesex #error \ "Unsupported bytesex: Either IS_BIG_ENDIAN or IS_LITTLE_ENDIAN must be defined" // NOLINT #endif // bytesex -#ifndef HostToNetworkLL +#ifndef htonll // With the rise of 64-bit, some systems are beginning to define this. -#define HostToNetworkLL(x) GHostToNetworkLL(x) -#endif // HostToNetworkLL +#define htonll(x) ghtonll(x) +#endif // htonll // ntoh* and hton* are the same thing for any size and bytesex, // since the function is an involution, i.e., its own inverse. -inline uint16_t GNetworkToHostS(uint16_t x) { return GHostToNetworkS(x); } -inline uint32_t GNetworkToHostL(uint32_t x) { return GHostToNetworkL(x); } -inline uint64_t GNetworkToHostLL(uint64_t x) { return GHostToNetworkLL(x); } +inline uint16_t gntohs(uint16_t x) { return ghtons(x); } +inline uint32_t gntohl(uint32_t x) { return ghtonl(x); } +inline uint64_t gntohll(uint64_t x) { return ghtonll(x); } -#ifndef NetworkToHostLL -#define NetworkToHostLL(x) GHostToNetworkLL(x) -#endif // NetworkToHostLL +#ifndef ntohll +#define ntohll(x) htonll(x) +#endif // ntohll #endif // ICING_PORTABLE_ENDIAN_H_ diff --git a/icing/portable/gzip_stream.cc b/icing/portable/gzip_stream.cc deleted file mode 100644 index f00a993..0000000 --- a/icing/portable/gzip_stream.cc +++ /dev/null @@ -1,313 +0,0 @@ -// Copyright (C) 2009 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// This file contains the implementation of classes GzipInputStream and -// GzipOutputStream. It is forked from protobuf because these classes are only -// provided in libprotobuf-full but we would like to link libicing against the -// smaller libprotobuf-lite instead. 
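The deletion that follows removes the forked gzip_stream entirely. As its header comment explains, the fork existed only so Icing could gzip protos while linking libprotobuf-lite; with the build switched back to the full protobuf runtime, the stock google::protobuf::io classes cover the same ground. A sketch of compressing a buffer with those stock classes, assuming full protobuf is linked:

#include <google/protobuf/io/gzip_stream.h>
#include <google/protobuf/io/zero_copy_stream_impl_lite.h>

#include <algorithm>
#include <cstring>
#include <string>

// Gzip-compresses `raw` into a std::string via protobuf's stream classes.
std::string Compress(const std::string& raw) {
  std::string out;
  google::protobuf::io::StringOutputStream string_stream(&out);
  google::protobuf::io::GzipOutputStream gzip_stream(&string_stream);
  size_t written = 0;
  void* data;
  int size;
  while (written < raw.size() && gzip_stream.Next(&data, &size)) {
    size_t to_copy = std::min<size_t>(size, raw.size() - written);
    memcpy(data, raw.data() + written, to_copy);
    written += to_copy;
    if (to_copy < static_cast<size_t>(size)) {
      gzip_stream.BackUp(size - to_copy);  // return the unused buffer tail
    }
  }
  gzip_stream.Close();  // flushes and writes the gzip trailer
  return out;
}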
- -#include "icing/portable/gzip_stream.h" -#include "icing/util/logging.h" - -namespace icing { -namespace lib { -namespace protobuf_ports { - -static const int kDefaultBufferSize = 65536; - -GzipInputStream::GzipInputStream(ZeroCopyInputStream* sub_stream, Format format, - int buffer_size) - : format_(format), sub_stream_(sub_stream), zerror_(Z_OK), byte_count_(0) { - zcontext_.state = Z_NULL; - zcontext_.zalloc = Z_NULL; - zcontext_.zfree = Z_NULL; - zcontext_.opaque = Z_NULL; - zcontext_.total_out = 0; - zcontext_.next_in = NULL; - zcontext_.avail_in = 0; - zcontext_.total_in = 0; - zcontext_.msg = NULL; - if (buffer_size == -1) { - output_buffer_length_ = kDefaultBufferSize; - } else { - output_buffer_length_ = buffer_size; - } - output_buffer_ = operator new(output_buffer_length_); - zcontext_.next_out = static_cast<Bytef*>(output_buffer_); - zcontext_.avail_out = output_buffer_length_; - output_position_ = output_buffer_; -} -GzipInputStream::~GzipInputStream() { - operator delete(output_buffer_); - zerror_ = inflateEnd(&zcontext_); -} - -static inline int internalInflateInit2(z_stream* zcontext, - GzipInputStream::Format format) { - int windowBitsFormat = 0; - switch (format) { - case GzipInputStream::GZIP: - windowBitsFormat = 16; - break; - case GzipInputStream::AUTO: - windowBitsFormat = 32; - break; - case GzipInputStream::ZLIB: - windowBitsFormat = 0; - break; - } - return inflateInit2(zcontext, /* windowBits */ 15 | windowBitsFormat); -} - -int GzipInputStream::Inflate(int flush) { - if ((zerror_ == Z_OK) && (zcontext_.avail_out == 0)) { - // previous inflate filled output buffer. don't change input params yet. - } else if (zcontext_.avail_in == 0) { - const void* in; - int in_size; - bool first = zcontext_.next_in == NULL; - bool ok = sub_stream_->Next(&in, &in_size); - if (!ok) { - zcontext_.next_out = NULL; - zcontext_.avail_out = 0; - return Z_STREAM_END; - } - zcontext_.next_in = static_cast<Bytef*>(const_cast<void*>(in)); - zcontext_.avail_in = in_size; - if (first) { - int error = internalInflateInit2(&zcontext_, format_); - if (error != Z_OK) { - return error; - } - } - } - zcontext_.next_out = static_cast<Bytef*>(output_buffer_); - zcontext_.avail_out = output_buffer_length_; - output_position_ = output_buffer_; - int error = inflate(&zcontext_, flush); - return error; -} - -void GzipInputStream::DoNextOutput(const void** data, int* size) { - *data = output_position_; - *size = ((uintptr_t)zcontext_.next_out) - ((uintptr_t)output_position_); - output_position_ = zcontext_.next_out; -} - -// implements ZeroCopyInputStream ---------------------------------- -bool GzipInputStream::Next(const void** data, int* size) { - bool ok = (zerror_ == Z_OK) || (zerror_ == Z_STREAM_END) || - (zerror_ == Z_BUF_ERROR); - if ((!ok) || (zcontext_.next_out == NULL)) { - return false; - } - if (zcontext_.next_out != output_position_) { - DoNextOutput(data, size); - return true; - } - if (zerror_ == Z_STREAM_END) { - if (zcontext_.next_out != NULL) { - // sub_stream_ may have concatenated streams to follow - zerror_ = inflateEnd(&zcontext_); - byte_count_ += zcontext_.total_out; - if (zerror_ != Z_OK) { - return false; - } - zerror_ = internalInflateInit2(&zcontext_, format_); - if (zerror_ != Z_OK) { - return false; - } - } else { - *data = NULL; - *size = 0; - return false; - } - } - zerror_ = Inflate(Z_NO_FLUSH); - if ((zerror_ == Z_STREAM_END) && (zcontext_.next_out == NULL)) { - // The underlying stream's Next returned false inside Inflate. 
- return false; - } - ok = (zerror_ == Z_OK) || (zerror_ == Z_STREAM_END) || - (zerror_ == Z_BUF_ERROR); - if (!ok) { - return false; - } - DoNextOutput(data, size); - return true; -} -void GzipInputStream::BackUp(int count) { - output_position_ = reinterpret_cast<void*>( - reinterpret_cast<uintptr_t>(output_position_) - count); -} -bool GzipInputStream::Skip(int count) { - const void* data; - int size = 0; - bool ok = Next(&data, &size); - while (ok && (size < count)) { - count -= size; - ok = Next(&data, &size); - } - if (size > count) { - BackUp(size - count); - } - return ok; -} -int64_t GzipInputStream::ByteCount() const { - int64_t ret = byte_count_ + zcontext_.total_out; - if (zcontext_.next_out != NULL && output_position_ != NULL) { - ret += reinterpret_cast<uintptr_t>(zcontext_.next_out) - - reinterpret_cast<uintptr_t>(output_position_); - } - return ret; -} - -// ========================================================================= - -GzipOutputStream::Options::Options() - : format(GZIP), - buffer_size(kDefaultBufferSize), - compression_level(Z_DEFAULT_COMPRESSION), - compression_strategy(Z_DEFAULT_STRATEGY) {} - -GzipOutputStream::GzipOutputStream(ZeroCopyOutputStream* sub_stream) { - Init(sub_stream, Options()); -} - -GzipOutputStream::GzipOutputStream(ZeroCopyOutputStream* sub_stream, - const Options& options) { - Init(sub_stream, options); -} - -void GzipOutputStream::Init(ZeroCopyOutputStream* sub_stream, - const Options& options) { - sub_stream_ = sub_stream; - sub_data_ = NULL; - sub_data_size_ = 0; - - input_buffer_length_ = options.buffer_size; - input_buffer_ = operator new(input_buffer_length_); - - zcontext_.zalloc = Z_NULL; - zcontext_.zfree = Z_NULL; - zcontext_.opaque = Z_NULL; - zcontext_.next_out = NULL; - zcontext_.avail_out = 0; - zcontext_.total_out = 0; - zcontext_.next_in = NULL; - zcontext_.avail_in = 0; - zcontext_.total_in = 0; - zcontext_.msg = NULL; - // default to GZIP format - int windowBitsFormat = 16; - if (options.format == ZLIB) { - windowBitsFormat = 0; - } - zerror_ = - deflateInit2(&zcontext_, options.compression_level, Z_DEFLATED, - /* windowBits */ 15 | windowBitsFormat, - /* memLevel (default) */ 8, options.compression_strategy); -} - -GzipOutputStream::~GzipOutputStream() { - Close(); - operator delete(input_buffer_); -} - -// private -int GzipOutputStream::Deflate(int flush) { - int error = Z_OK; - do { - if ((sub_data_ == NULL) || (zcontext_.avail_out == 0)) { - bool ok = sub_stream_->Next(&sub_data_, &sub_data_size_); - if (!ok) { - sub_data_ = NULL; - sub_data_size_ = 0; - return Z_BUF_ERROR; - } - if (sub_data_size_ <= 0) { - ICING_LOG(FATAL) << "Failed to advance underlying stream"; - } - zcontext_.next_out = static_cast<Bytef*>(sub_data_); - zcontext_.avail_out = sub_data_size_; - } - error = deflate(&zcontext_, flush); - } while (error == Z_OK && zcontext_.avail_out == 0); - if ((flush == Z_FULL_FLUSH) || (flush == Z_FINISH)) { - // Notify lower layer of data. - sub_stream_->BackUp(zcontext_.avail_out); - // We don't own the buffer anymore. - sub_data_ = NULL; - sub_data_size_ = 0; - } - return error; -} - -// implements ZeroCopyOutputStream --------------------------------- -bool GzipOutputStream::Next(void** data, int* size) { - if ((zerror_ != Z_OK) && (zerror_ != Z_BUF_ERROR)) { - return false; - } - if (zcontext_.avail_in != 0) { - zerror_ = Deflate(Z_NO_FLUSH); - if (zerror_ != Z_OK) { - return false; - } - } - if (zcontext_.avail_in == 0) { - // all input was consumed. reset the buffer. 
- zcontext_.next_in = static_cast<Bytef*>(input_buffer_); - zcontext_.avail_in = input_buffer_length_; - *data = input_buffer_; - *size = input_buffer_length_; - } else { - // The loop in Deflate should consume all avail_in - ICING_LOG(ERROR) << "Deflate left bytes unconsumed"; - } - return true; -} -void GzipOutputStream::BackUp(int count) { - if (zcontext_.avail_in < static_cast<uInt>(count)) { - ICING_LOG(FATAL) << "Not enough data to back up " << count << " bytes"; - } - zcontext_.avail_in -= count; -} -int64_t GzipOutputStream::ByteCount() const { - return zcontext_.total_in + zcontext_.avail_in; -} - -bool GzipOutputStream::Flush() { - zerror_ = Deflate(Z_FULL_FLUSH); - // Return true if the flush succeeded or if it was a no-op. - return (zerror_ == Z_OK) || - (zerror_ == Z_BUF_ERROR && zcontext_.avail_in == 0 && - zcontext_.avail_out != 0); -} - -bool GzipOutputStream::Close() { - if ((zerror_ != Z_OK) && (zerror_ != Z_BUF_ERROR)) { - return false; - } - do { - zerror_ = Deflate(Z_FINISH); - } while (zerror_ == Z_OK); - zerror_ = deflateEnd(&zcontext_); - bool ok = zerror_ == Z_OK; - zerror_ = Z_STREAM_END; - return ok; -} - -} // namespace protobuf_ports -} // namespace lib -} // namespace icing diff --git a/icing/portable/gzip_stream.h b/icing/portable/gzip_stream.h deleted file mode 100644 index 602093f..0000000 --- a/icing/portable/gzip_stream.h +++ /dev/null @@ -1,181 +0,0 @@ -// Copyright (C) 2009 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// This file contains the definition for classes GzipInputStream and -// GzipOutputStream. It is forked from protobuf because these classes are only -// provided in libprotobuf-full but we would like to link libicing against the -// smaller libprotobuf-lite instead. -// -// GzipInputStream decompresses data from an underlying -// ZeroCopyInputStream and provides the decompressed data as a -// ZeroCopyInputStream. -// -// GzipOutputStream is an ZeroCopyOutputStream that compresses data to -// an underlying ZeroCopyOutputStream. - -#ifndef GOOGLE3_ICING_PORTABLE_GZIP_STREAM_H_ -#define GOOGLE3_ICING_PORTABLE_GZIP_STREAM_H_ - -#include <google/protobuf/io/zero_copy_stream_impl_lite.h> -#include "icing/portable/zlib.h" - -namespace icing { -namespace lib { -namespace protobuf_ports { - -// A ZeroCopyInputStream that reads compressed data through zlib -class GzipInputStream : public google::protobuf::io::ZeroCopyInputStream { - public: - // Format key for constructor - enum Format { - // zlib will autodetect gzip header or deflate stream - AUTO = 0, - - // GZIP streams have some extra header data for file attributes. - GZIP = 1, - - // Simpler zlib stream format. - ZLIB = 2, - }; - - // buffer_size and format may be -1 for default of 64kB and GZIP format - explicit GzipInputStream( - google::protobuf::io::ZeroCopyInputStream* sub_stream, - Format format = AUTO, int buffer_size = -1); - virtual ~GzipInputStream(); - - // Return last error message or NULL if no error. 
- inline const char* ZlibErrorMessage() const { return zcontext_.msg; } - inline int ZlibErrorCode() const { return zerror_; } - - // implements ZeroCopyInputStream ---------------------------------- - bool Next(const void** data, int* size) override; - void BackUp(int count) override; - bool Skip(int count) override; - int64_t ByteCount() const override; - - private: - Format format_; - - google::protobuf::io::ZeroCopyInputStream* sub_stream_; - - z_stream zcontext_; - int zerror_; - - void* output_buffer_; - void* output_position_; - size_t output_buffer_length_; - int64_t byte_count_; - - int Inflate(int flush); - void DoNextOutput(const void** data, int* size); -}; - -class GzipOutputStream : public google::protobuf::io::ZeroCopyOutputStream { - public: - // Format key for constructor - enum Format { - // GZIP streams have some extra header data for file attributes. - GZIP = 1, - - // Simpler zlib stream format. - ZLIB = 2, - }; - - struct Options { - // Defaults to GZIP. - Format format; - - // What size buffer to use internally. Defaults to 64kB. - int buffer_size; - - // A number between 0 and 9, where 0 is no compression and 9 is best - // compression. Defaults to Z_DEFAULT_COMPRESSION (see zlib.h). - int compression_level; - - // Defaults to Z_DEFAULT_STRATEGY. Can also be set to Z_FILTERED, - // Z_HUFFMAN_ONLY, or Z_RLE. See the documentation for deflateInit2 in - // zlib.h for definitions of these constants. - int compression_strategy; - - Options(); // Initializes with default values. - }; - - // Create a GzipOutputStream with default options. - explicit GzipOutputStream( - google::protobuf::io::ZeroCopyOutputStream* sub_stream); - - // Create a GzipOutputStream with the given options. - GzipOutputStream( - google::protobuf::io::ZeroCopyOutputStream* sub_stream, - const Options& options); - - virtual ~GzipOutputStream(); - - // Return last error message or NULL if no error. - inline const char* ZlibErrorMessage() const { return zcontext_.msg; } - inline int ZlibErrorCode() const { return zerror_; } - - // Flushes data written so far to zipped data in the underlying stream. - // It is the caller's responsibility to flush the underlying stream if - // necessary. - // Compression may be less efficient stopping and starting around flushes. - // Returns true if no error. - // - // Please ensure that block size is > 6. Here is an excerpt from the zlib - // doc that explains why: - // - // In the case of a Z_FULL_FLUSH or Z_SYNC_FLUSH, make sure that avail_out - // is greater than six to avoid repeated flush markers due to - // avail_out == 0 on return. - bool Flush(); - - // Writes out all data and closes the gzip stream. - // It is the caller's responsibility to close the underlying stream if - // necessary. - // Returns true if no error. - bool Close(); - - // implements ZeroCopyOutputStream --------------------------------- - bool Next(void** data, int* size) override; - void BackUp(int count) override; - int64_t ByteCount() const override; - - private: - google::protobuf::io::ZeroCopyOutputStream* sub_stream_; - // Result from calling Next() on sub_stream_ - void* sub_data_; - int sub_data_size_; - - z_stream zcontext_; - int zerror_; - void* input_buffer_; - size_t input_buffer_length_; - - // Shared constructor code. - void Init( - google::protobuf::io::ZeroCopyOutputStream* sub_stream, - const Options& options); - - // Do some compression. - // Takes zlib flush mode. - // Returns zlib error code. 
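Throughout the deleted implementation above (internalInflateInit2 and GzipOutputStream::Init), the container format is selected by folding flags into zlib's windowBits argument: 15 is the raw window size, adding 16 requests a gzip wrapper, and adding 32 lets inflate auto-detect gzip versus zlib input. Condensed into a standalone helper that uses only the zlib API:

#include <zlib.h>

// ctx must already have zalloc/zfree/opaque initialized (zero-filling the
// struct is enough). Returns Z_OK on success.
int InflateInitFor(z_stream* ctx, bool gzip_wrapper, bool auto_detect) {
  int window_bits = 15;  // maximum history window
  if (auto_detect) {
    window_bits |= 32;  // accept either a zlib or a gzip wrapper
  } else if (gzip_wrapper) {
    window_bits |= 16;  // require a gzip wrapper
  }
  return inflateInit2(ctx, window_bits);
}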
- int Deflate(int flush); -}; - -} // namespace protobuf_ports -} // namespace lib -} // namespace icing - -#endif // GOOGLE3_ICING_PORTABLE_GZIP_STREAM_H_ diff --git a/icing/query/query-processor.cc b/icing/query/query-processor.cc index 36c76db..1f937fd 100644 --- a/icing/query/query-processor.cc +++ b/icing/query/query-processor.cc @@ -182,7 +182,7 @@ QueryProcessor::ParseRawQuery(const SearchSpecProto& search_spec) { const Token& token = tokens.at(i); std::unique_ptr<DocHitInfoIterator> result_iterator; - // TODO(b/202076890): Handle negation tokens + // TODO(cassiewang): Handle negation tokens switch (token.type) { case Token::Type::QUERY_LEFT_PARENTHESES: { frames.emplace(ParserStateFrame()); diff --git a/icing/query/query-processor_benchmark.cc b/icing/query/query-processor_benchmark.cc index e48fe78..bdd40aa 100644 --- a/icing/query/query-processor_benchmark.cc +++ b/icing/query/query-processor_benchmark.cc @@ -16,6 +16,7 @@ #include "gmock/gmock.h" #include "third_party/absl/flags/flag.h" #include "icing/document-builder.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/index/index.h" #include "icing/proto/term.pb.h" #include "icing/query/query-processor.h" @@ -23,7 +24,6 @@ #include "icing/schema/section.h" #include "icing/store/document-id.h" #include "icing/testing/common-matchers.h" -#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" #include "icing/tokenization/language-segmenter-factory.h" diff --git a/icing/query/query-processor_test.cc b/icing/query/query-processor_test.cc index eaa0efc..daeb479 100644 --- a/icing/query/query-processor_test.cc +++ b/icing/query/query-processor_test.cc @@ -23,6 +23,7 @@ #include "gtest/gtest.h" #include "icing/document-builder.h" #include "icing/file/filesystem.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/index/hit/doc-hit-info.h" #include "icing/index/index.h" #include "icing/index/iterator/doc-hit-info-iterator-test-util.h" @@ -39,7 +40,6 @@ #include "icing/store/document-store.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" -#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/jni-test-helpers.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" @@ -61,30 +61,28 @@ using ::testing::SizeIs; using ::testing::Test; using ::testing::UnorderedElementsAre; -constexpr PropertyConfigProto::DataType::Code TYPE_STRING = - PropertyConfigProto::DataType::STRING; +constexpr PropertyConfigProto_DataType_Code TYPE_STRING = + PropertyConfigProto_DataType_Code_STRING; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = - PropertyConfigProto::Cardinality::OPTIONAL; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = + PropertyConfigProto_Cardinality_Code_OPTIONAL; -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = - StringIndexingConfig::TokenizerType::PLAIN; +constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = + StringIndexingConfig_TokenizerType_Code_PLAIN; -constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; +constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; class QueryProcessorTest : public Test { protected: QueryProcessorTest() : test_dir_(GetTestTempDir() + "/icing"), store_dir_(test_dir_ + "/store"), - schema_store_dir_(test_dir_ + "/schema_store"), index_dir_(test_dir_ + "/index") {} void SetUp() override { 
filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()); filesystem_.CreateDirectoryRecursively(index_dir_.c_str()); filesystem_.CreateDirectoryRecursively(store_dir_.c_str()); - filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str()); if (!IsCfStringTokenization() && !IsReverseJniTokenization()) { // If we've specified using the reverse-JNI method for segmentation (i.e. @@ -131,7 +129,6 @@ class QueryProcessorTest : public Test { Filesystem filesystem_; const std::string test_dir_; const std::string store_dir_; - const std::string schema_store_dir_; std::unique_ptr<Index> index_; std::unique_ptr<LanguageSegmenter> language_segmenter_; std::unique_ptr<Normalizer> normalizer_; @@ -179,7 +176,7 @@ TEST_F(QueryProcessorTest, EmptyGroupMatchAllDocuments) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( @@ -230,7 +227,7 @@ TEST_F(QueryProcessorTest, EmptyQueryMatchAllDocuments) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( @@ -281,7 +278,7 @@ TEST_F(QueryProcessorTest, QueryTermNormalized) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( @@ -357,7 +354,7 @@ TEST_F(QueryProcessorTest, OneTermPrefixMatch) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( @@ -425,7 +422,7 @@ TEST_F(QueryProcessorTest, OneTermExactMatch) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( @@ -493,7 +490,7 @@ TEST_F(QueryProcessorTest, AndSameTermExactMatch) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( @@ -563,7 +560,7 @@ TEST_F(QueryProcessorTest, AndTwoTermExactMatch) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( @@ -638,7 +635,7 @@ TEST_F(QueryProcessorTest, AndSameTermPrefixMatch) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( @@ -708,7 +705,7 @@ TEST_F(QueryProcessorTest, AndTwoTermPrefixMatch) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + 
SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( @@ -784,7 +781,7 @@ TEST_F(QueryProcessorTest, AndTwoTermPrefixAndExactMatch) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( @@ -860,7 +857,7 @@ TEST_F(QueryProcessorTest, OrTwoTermExactMatch) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( @@ -949,7 +946,7 @@ TEST_F(QueryProcessorTest, OrTwoTermPrefixMatch) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( @@ -1037,7 +1034,7 @@ TEST_F(QueryProcessorTest, OrTwoTermPrefixAndExactMatch) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( @@ -1124,7 +1121,7 @@ TEST_F(QueryProcessorTest, CombinedAndOrTerms) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( @@ -1310,7 +1307,7 @@ TEST_F(QueryProcessorTest, OneGroup) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( @@ -1386,7 +1383,7 @@ TEST_F(QueryProcessorTest, TwoGroups) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( @@ -1464,7 +1461,7 @@ TEST_F(QueryProcessorTest, ManyLevelNestedGrouping) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( @@ -1540,7 +1537,7 @@ TEST_F(QueryProcessorTest, OneLevelNestedGrouping) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( @@ -1617,7 +1614,7 @@ TEST_F(QueryProcessorTest, ExcludeTerm) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( @@ -1682,7 +1679,7 @@ TEST_F(QueryProcessorTest, ExcludeNonexistentTerm) { 
ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( @@ -1745,7 +1742,7 @@ TEST_F(QueryProcessorTest, ExcludeAnd) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( @@ -1835,7 +1832,7 @@ TEST_F(QueryProcessorTest, ExcludeOr) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( @@ -1931,7 +1928,7 @@ TEST_F(QueryProcessorTest, DeletedFilter) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( @@ -2005,7 +2002,7 @@ TEST_F(QueryProcessorTest, NamespaceFilter) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( @@ -2081,7 +2078,7 @@ TEST_F(QueryProcessorTest, SchemaTypeFilter) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( @@ -2158,7 +2155,7 @@ TEST_F(QueryProcessorTest, SectionFilterForOneDocument) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( @@ -2240,7 +2237,7 @@ TEST_F(QueryProcessorTest, SectionFilterAcrossSchemaTypes) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( @@ -2323,7 +2320,7 @@ TEST_F(QueryProcessorTest, SectionFilterWithinSchemaType) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( @@ -2407,7 +2404,7 @@ TEST_F(QueryProcessorTest, SectionFilterRespectsDifferentSectionIds) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( @@ -2480,7 +2477,7 @@ TEST_F(QueryProcessorTest, NonexistentSectionFilterReturnsEmptyResults) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); 
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( @@ -2547,7 +2544,7 @@ TEST_F(QueryProcessorTest, UnindexedSectionFilterReturnsEmptyResults) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( @@ -2617,7 +2614,7 @@ TEST_F(QueryProcessorTest, SectionFilterTermAndUnrestrictedTerm) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( @@ -2692,7 +2689,7 @@ TEST_F(QueryProcessorTest, DocumentBeforeTtlNotFilteredOut) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); // Arbitrary value, just has to be less than the document's creation @@ -2751,7 +2748,7 @@ TEST_F(QueryProcessorTest, DocumentPastTtlFilteredOut) { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); // Arbitrary value, just has to be greater than the document's creation diff --git a/icing/query/suggestion-processor.cc b/icing/query/suggestion-processor.cc deleted file mode 100644 index cfa53f6..0000000 --- a/icing/query/suggestion-processor.cc +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (C) 2021 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "icing/query/suggestion-processor.h" - -#include "icing/tokenization/tokenizer-factory.h" -#include "icing/tokenization/tokenizer.h" -#include "icing/transform/normalizer.h" - -namespace icing { -namespace lib { - -libtextclassifier3::StatusOr<std::unique_ptr<SuggestionProcessor>> -SuggestionProcessor::Create(Index* index, - const LanguageSegmenter* language_segmenter, - const Normalizer* normalizer) { - ICING_RETURN_ERROR_IF_NULL(index); - ICING_RETURN_ERROR_IF_NULL(language_segmenter); - - return std::unique_ptr<SuggestionProcessor>( - new SuggestionProcessor(index, language_segmenter, normalizer)); -} - -libtextclassifier3::StatusOr<std::vector<TermMetadata>> -SuggestionProcessor::QuerySuggestions( - const icing::lib::SuggestionSpecProto& suggestion_spec, - const NamespaceChecker* namespace_checker) { - // We use query tokenizer to tokenize the give prefix, and we only use the - // last token to be the suggestion prefix. 
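The deleted QuerySuggestions implementation, which continues below, tokenizes the whole user prefix but only expands the final token; everything before it is echoed back verbatim in front of each suggestion. Stripped of the tokenizer, the splitting step amounts to the following whitespace-only simplification (Icing's real code segments with its language-aware tokenizer instead):

#include <string>
#include <utility>

// Returns {text to echo back, last token to expand}.
std::pair<std::string, std::string> SplitSuggestQuery(
    const std::string& query) {
  size_t pos = query.find_last_of(' ');
  if (pos == std::string::npos) {
    return {"", query};  // single-token query: expand the whole thing
  }
  return {query.substr(0, pos + 1), query.substr(pos + 1)};
}

// e.g. SplitSuggestQuery("bar f") == {"bar ", "f"}; suggestions for "f"
// then come back as "bar foo", "bar fly", ...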
- ICING_ASSIGN_OR_RETURN( - std::unique_ptr<Tokenizer> tokenizer, - tokenizer_factory::CreateIndexingTokenizer( - StringIndexingConfig::TokenizerType::PLAIN, &language_segmenter_)); - ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iterator, - tokenizer->Tokenize(suggestion_spec.prefix())); - - // If there are previous tokens, they are prepended to the suggestion, - // separated by spaces. - std::string last_token; - int token_start_pos; - while (iterator->Advance()) { - Token token = iterator->GetToken(); - last_token = token.text; - token_start_pos = token.text.data() - suggestion_spec.prefix().c_str(); - } - - // If the position of the last token is not the end of the prefix, it means - // there should be some operator tokens after it and are ignored by the - // tokenizer. - bool is_last_token = token_start_pos + last_token.length() >= - suggestion_spec.prefix().length(); - - if (!is_last_token || last_token.empty()) { - // We don't have a valid last token, return early. - return std::vector<TermMetadata>(); - } - - std::string query_prefix = - suggestion_spec.prefix().substr(0, token_start_pos); - // Run suggestion based on given SuggestionSpec. - // Normalize token text to lowercase since all tokens in the lexicon are - // lowercase. - ICING_ASSIGN_OR_RETURN( - std::vector<TermMetadata> terms, - index_.FindTermsByPrefix( - normalizer_.NormalizeTerm(last_token), - suggestion_spec.num_to_return(), - suggestion_spec.scoring_spec().scoring_match_type(), - namespace_checker)); - - for (TermMetadata& term : terms) { - term.content = query_prefix + term.content; - } - return terms; -} - -SuggestionProcessor::SuggestionProcessor( - Index* index, const LanguageSegmenter* language_segmenter, - const Normalizer* normalizer) - : index_(*index), - language_segmenter_(*language_segmenter), - normalizer_(*normalizer) {} - -} // namespace lib -} // namespace icing diff --git a/icing/query/suggestion-processor.h b/icing/query/suggestion-processor.h deleted file mode 100644 index 088863e..0000000 --- a/icing/query/suggestion-processor.h +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright (C) 2021 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef ICING_QUERY_SUGGESTION_PROCESSOR_H_ -#define ICING_QUERY_SUGGESTION_PROCESSOR_H_ - -#include "icing/text_classifier/lib3/utils/base/statusor.h" -#include "icing/index/index.h" -#include "icing/proto/search.pb.h" -#include "icing/tokenization/language-segmenter.h" -#include "icing/transform/normalizer.h" - -namespace icing { -namespace lib { - -// Processes SuggestionSpecProtos and retrieves the specified TermMedaData that -// satisfies the prefix and its restrictions. This also performs ranking, and -// returns TermMetaData ordered by their hit count. -class SuggestionProcessor { - public: - // Factory function to create a SuggestionProcessor which does not take - // ownership of any input components, and all pointers must refer to valid - // objects that outlive the created SuggestionProcessor instance. 
- // - // Returns: - // An SuggestionProcessor on success - // FAILED_PRECONDITION if any of the pointers is null. - static libtextclassifier3::StatusOr<std::unique_ptr<SuggestionProcessor>> - Create(Index* index, const LanguageSegmenter* language_segmenter, - const Normalizer* normalizer); - - // Query suggestions based on the given SuggestionSpecProto. - // - // Returns: - // On success, - // - One vector that represents the entire TermMetadata - // INTERNAL_ERROR on all other errors - libtextclassifier3::StatusOr<std::vector<TermMetadata>> QuerySuggestions( - const SuggestionSpecProto& suggestion_spec, - const NamespaceChecker* namespace_checker); - - private: - explicit SuggestionProcessor(Index* index, - const LanguageSegmenter* language_segmenter, - const Normalizer* normalizer); - - // Not const because we could modify/sort the TermMetaData buffer in the lite - // index. - Index& index_; - const LanguageSegmenter& language_segmenter_; - const Normalizer& normalizer_; -}; - -} // namespace lib -} // namespace icing - -#endif // ICING_QUERY_SUGGESTION_PROCESSOR_H_ diff --git a/icing/query/suggestion-processor_test.cc b/icing/query/suggestion-processor_test.cc deleted file mode 100644 index ba4c90a..0000000 --- a/icing/query/suggestion-processor_test.cc +++ /dev/null @@ -1,326 +0,0 @@ -// Copyright (C) 2021 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "icing/query/suggestion-processor.h" - -#include "gmock/gmock.h" -#include "icing/store/document-store.h" -#include "icing/testing/always-true-namespace-checker-impl.h" -#include "icing/testing/common-matchers.h" -#include "icing/testing/fake-clock.h" -#include "icing/testing/icu-data-file-helper.h" -#include "icing/testing/jni-test-helpers.h" -#include "icing/testing/test-data.h" -#include "icing/testing/tmp-directory.h" -#include "icing/tokenization/language-segmenter-factory.h" -#include "icing/transform/normalizer-factory.h" -#include "unicode/uloc.h" - -namespace icing { -namespace lib { - -namespace { - -using ::testing::IsEmpty; -using ::testing::Test; - -class SuggestionProcessorTest : public Test { - protected: - SuggestionProcessorTest() - : test_dir_(GetTestTempDir() + "/icing"), - store_dir_(test_dir_ + "/store"), - index_dir_(test_dir_ + "/index") {} - - void SetUp() override { - filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()); - filesystem_.CreateDirectoryRecursively(index_dir_.c_str()); - filesystem_.CreateDirectoryRecursively(store_dir_.c_str()); - - if (!IsCfStringTokenization() && !IsReverseJniTokenization()) { - // If we've specified using the reverse-JNI method for segmentation (i.e. - // not ICU), then we won't have the ICU data file included to set up. - // Technically, we could choose to use reverse-JNI for segmentation AND - // include an ICU data file, but that seems unlikely and our current BUILD - // setup doesn't do this. - ICING_ASSERT_OK( - // File generated via icu_data_file rule in //icing/BUILD. 
- icu_data_file_helper::SetUpICUDataFile( - GetTestFilePath("icing/icu.dat"))); - } - - Index::Options options(index_dir_, - /*index_merge_size=*/1024 * 1024); - ICING_ASSERT_OK_AND_ASSIGN( - index_, Index::Create(options, &filesystem_, &icing_filesystem_)); - - language_segmenter_factory::SegmenterOptions segmenter_options( - ULOC_US, jni_cache_.get()); - ICING_ASSERT_OK_AND_ASSIGN( - language_segmenter_, - language_segmenter_factory::Create(segmenter_options)); - - ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create( - /*max_term_byte_size=*/1000)); - - ICING_ASSERT_OK_AND_ASSIGN( - schema_store_, - SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); - - ICING_ASSERT_OK_AND_ASSIGN( - DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_, - schema_store_.get())); - } - - libtextclassifier3::Status AddTokenToIndex( - DocumentId document_id, SectionId section_id, - TermMatchType::Code term_match_type, const std::string& token) { - Index::Editor editor = index_->Edit(document_id, section_id, - term_match_type, /*namespace_id=*/0); - auto status = editor.BufferTerm(token.c_str()); - return status.ok() ? editor.IndexAllBufferedTerms() : status; - } - - void TearDown() override { - filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()); - } - - Filesystem filesystem_; - const std::string test_dir_; - const std::string store_dir_; - std::unique_ptr<Index> index_; - std::unique_ptr<LanguageSegmenter> language_segmenter_; - std::unique_ptr<Normalizer> normalizer_; - std::unique_ptr<SchemaStore> schema_store_; - std::unique_ptr<const JniCache> jni_cache_ = GetTestJniCache(); - FakeClock fake_clock_; - - private: - IcingFilesystem icing_filesystem_; - const std::string index_dir_; -}; - -constexpr DocumentId kDocumentId0 = 0; -constexpr SectionId kSectionId2 = 2; - -TEST_F(SuggestionProcessorTest, PrependedPrefixTokenTest) { - ASSERT_THAT(AddTokenToIndex(kDocumentId0, kSectionId2, - TermMatchType::EXACT_ONLY, "foo"), - IsOk()); - - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<SuggestionProcessor> suggestion_processor, - SuggestionProcessor::Create(index_.get(), language_segmenter_.get(), - normalizer_.get())); - - SuggestionSpecProto suggestion_spec; - suggestion_spec.set_prefix( - "prefix token should be prepended to the suggestion f"); - suggestion_spec.set_num_to_return(10); - - AlwaysTrueNamespaceCheckerImpl impl; - ICING_ASSERT_OK_AND_ASSIGN( - std::vector<TermMetadata> terms, - suggestion_processor->QuerySuggestions(suggestion_spec, &impl)); - EXPECT_THAT(terms.at(0).content, - "prefix token should be prepended to the suggestion foo"); -} - -TEST_F(SuggestionProcessorTest, NonExistentPrefixTest) { - ASSERT_THAT(AddTokenToIndex(kDocumentId0, kSectionId2, - TermMatchType::EXACT_ONLY, "foo"), - IsOk()); - - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<SuggestionProcessor> suggestion_processor, - SuggestionProcessor::Create(index_.get(), language_segmenter_.get(), - normalizer_.get())); - - SuggestionSpecProto suggestion_spec; - suggestion_spec.set_prefix("nonExistTerm"); - suggestion_spec.set_num_to_return(10); - - AlwaysTrueNamespaceCheckerImpl impl; - ICING_ASSERT_OK_AND_ASSIGN( - std::vector<TermMetadata> terms, - suggestion_processor->QuerySuggestions(suggestion_spec, &impl)); - - EXPECT_THAT(terms, IsEmpty()); -} - -TEST_F(SuggestionProcessorTest, PrefixTrailingSpaceTest) { - ASSERT_THAT(AddTokenToIndex(kDocumentId0, kSectionId2, - TermMatchType::EXACT_ONLY, "foo"), - IsOk()); - - ICING_ASSERT_OK_AND_ASSIGN( - 
std::unique_ptr<SuggestionProcessor> suggestion_processor, - SuggestionProcessor::Create(index_.get(), language_segmenter_.get(), - normalizer_.get())); - - SuggestionSpecProto suggestion_spec; - suggestion_spec.set_prefix("f "); - suggestion_spec.set_num_to_return(10); - - AlwaysTrueNamespaceCheckerImpl impl; - ICING_ASSERT_OK_AND_ASSIGN( - std::vector<TermMetadata> terms, - suggestion_processor->QuerySuggestions(suggestion_spec, &impl)); - - EXPECT_THAT(terms, IsEmpty()); -} - -TEST_F(SuggestionProcessorTest, NormalizePrefixTest) { - ASSERT_THAT(AddTokenToIndex(kDocumentId0, kSectionId2, - TermMatchType::EXACT_ONLY, "foo"), - IsOk()); - - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<SuggestionProcessor> suggestion_processor, - SuggestionProcessor::Create(index_.get(), language_segmenter_.get(), - normalizer_.get())); - - SuggestionSpecProto suggestion_spec; - suggestion_spec.set_prefix("F"); - suggestion_spec.set_num_to_return(10); - - AlwaysTrueNamespaceCheckerImpl impl; - ICING_ASSERT_OK_AND_ASSIGN( - std::vector<TermMetadata> terms, - suggestion_processor->QuerySuggestions(suggestion_spec, &impl)); - EXPECT_THAT(terms.at(0).content, "foo"); - - suggestion_spec.set_prefix("fO"); - ICING_ASSERT_OK_AND_ASSIGN( - terms, suggestion_processor->QuerySuggestions(suggestion_spec, &impl)); - EXPECT_THAT(terms.at(0).content, "foo"); - - suggestion_spec.set_prefix("Fo"); - ICING_ASSERT_OK_AND_ASSIGN( - terms, suggestion_processor->QuerySuggestions(suggestion_spec, &impl)); - EXPECT_THAT(terms.at(0).content, "foo"); - - suggestion_spec.set_prefix("FO"); - ICING_ASSERT_OK_AND_ASSIGN( - terms, suggestion_processor->QuerySuggestions(suggestion_spec, &impl)); - EXPECT_THAT(terms.at(0).content, "foo"); -} - -TEST_F(SuggestionProcessorTest, OrOperatorPrefixTest) { - ASSERT_THAT(AddTokenToIndex(kDocumentId0, kSectionId2, - TermMatchType::EXACT_ONLY, "foo"), - IsOk()); - ASSERT_THAT(AddTokenToIndex(kDocumentId0, kSectionId2, - TermMatchType::EXACT_ONLY, "original"), - IsOk()); - - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<SuggestionProcessor> suggestion_processor, - SuggestionProcessor::Create(index_.get(), language_segmenter_.get(), - normalizer_.get())); - - SuggestionSpecProto suggestion_spec; - suggestion_spec.set_prefix("f OR"); - suggestion_spec.set_num_to_return(10); - - AlwaysTrueNamespaceCheckerImpl impl; - ICING_ASSERT_OK_AND_ASSIGN( - std::vector<TermMetadata> terms, - suggestion_processor->QuerySuggestions(suggestion_spec, &impl)); - - // Last Operator token will be used to query suggestion - EXPECT_THAT(terms.at(0).content, "f original"); -} - -TEST_F(SuggestionProcessorTest, ParenthesesOperatorPrefixTest) { - ASSERT_THAT(AddTokenToIndex(kDocumentId0, kSectionId2, - TermMatchType::EXACT_ONLY, "foo"), - IsOk()); - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<SuggestionProcessor> suggestion_processor, - SuggestionProcessor::Create(index_.get(), language_segmenter_.get(), - normalizer_.get())); - - SuggestionSpecProto suggestion_spec; - suggestion_spec.set_prefix("{f}"); - suggestion_spec.set_num_to_return(10); - - AlwaysTrueNamespaceCheckerImpl impl; - ICING_ASSERT_OK_AND_ASSIGN( - std::vector<TermMetadata> terms, - suggestion_processor->QuerySuggestions(suggestion_spec, &impl)); - EXPECT_THAT(terms, IsEmpty()); - - suggestion_spec.set_prefix("[f]"); - ICING_ASSERT_OK_AND_ASSIGN( - terms, suggestion_processor->QuerySuggestions(suggestion_spec, &impl)); - EXPECT_THAT(terms, IsEmpty()); - - suggestion_spec.set_prefix("(f)"); - ICING_ASSERT_OK_AND_ASSIGN( - terms, 
suggestion_processor->QuerySuggestions(suggestion_spec, &impl)); - EXPECT_THAT(terms, IsEmpty()); -} - -TEST_F(SuggestionProcessorTest, OtherSpecialPrefixTest) { - ASSERT_THAT(AddTokenToIndex(kDocumentId0, kSectionId2, - TermMatchType::EXACT_ONLY, "foo"), - IsOk()); - - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<SuggestionProcessor> suggestion_processor, - SuggestionProcessor::Create(index_.get(), language_segmenter_.get(), - normalizer_.get())); - - SuggestionSpecProto suggestion_spec; - suggestion_spec.set_prefix("f:"); - suggestion_spec.set_num_to_return(10); - - AlwaysTrueNamespaceCheckerImpl impl; - ICING_ASSERT_OK_AND_ASSIGN( - std::vector<TermMetadata> terms, - suggestion_processor->QuerySuggestions(suggestion_spec, &impl)); - EXPECT_THAT(terms, IsEmpty()); - - suggestion_spec.set_prefix("f-"); - ICING_ASSERT_OK_AND_ASSIGN( - terms, suggestion_processor->QuerySuggestions(suggestion_spec, &impl)); - EXPECT_THAT(terms, IsEmpty()); -} - -TEST_F(SuggestionProcessorTest, InvalidPrefixTest) { - ASSERT_THAT(AddTokenToIndex(kDocumentId0, kSectionId2, - TermMatchType::EXACT_ONLY, "original"), - IsOk()); - - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<SuggestionProcessor> suggestion_processor, - SuggestionProcessor::Create(index_.get(), language_segmenter_.get(), - normalizer_.get())); - - SuggestionSpecProto suggestion_spec; - suggestion_spec.set_prefix("OR OR - :"); - suggestion_spec.set_num_to_return(10); - - AlwaysTrueNamespaceCheckerImpl impl; - ICING_ASSERT_OK_AND_ASSIGN( - std::vector<TermMetadata> terms, - suggestion_processor->QuerySuggestions(suggestion_spec, &impl)); - EXPECT_THAT(terms, IsEmpty()); -} - -} // namespace - -} // namespace lib -} // namespace icing diff --git a/icing/result/result-retriever_test.cc b/icing/result/result-retriever_test.cc index 0d812e4..1c9684d 100644 --- a/icing/result/result-retriever_test.cc +++ b/icing/result/result-retriever_test.cc @@ -22,6 +22,7 @@ #include "gtest/gtest.h" #include "icing/document-builder.h" #include "icing/file/mock-filesystem.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/equals-proto.h" #include "icing/portable/platform.h" #include "icing/proto/document.pb.h" @@ -35,7 +36,6 @@ #include "icing/store/document-id.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" -#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/snippet-helpers.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" @@ -55,14 +55,14 @@ using ::testing::IsEmpty; using ::testing::Return; using ::testing::SizeIs; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = - PropertyConfigProto::Cardinality::OPTIONAL; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = + PropertyConfigProto_Cardinality_Code_OPTIONAL; -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = - StringIndexingConfig::TokenizerType::PLAIN; +constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = + StringIndexingConfig_TokenizerType_Code_PLAIN; -constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; -constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX; +constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; +constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX; class ResultRetrieverTest : public testing::Test { protected: @@ -160,7 +160,7 @@ ResultSpecProto::SnippetSpecProto CreateSnippetSpec() { ResultSpecProto::SnippetSpecProto snippet_spec; 
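// Aside: the IOErrorShouldReturnInternalError change below swaps which
// filesystem call is stubbed, but both sides use the same gMock
// default-action idiom. A minimal self-contained version of that idiom
// (MockFs is a stand-in for the project's MockFilesystem):
#include "gmock/gmock.h"
#include "gtest/gtest.h"

class MockFs {
 public:
  MOCK_METHOD(bool, OpenForRead, (const char* path));
};

TEST(MockFsExample, DefaultActionSketch) {
  ::testing::NiceMock<MockFs> fs;
  // ON_CALL sets a default action without an expectation, so the test
  // exercises the resulting error path instead of asserting call counts.
  ON_CALL(fs, OpenForRead(::testing::_))
      .WillByDefault(::testing::Return(false));
  EXPECT_FALSE(fs.OpenForRead("/any/path"));
}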
snippet_spec.set_num_to_snippet(std::numeric_limits<int>::max()); snippet_spec.set_num_matches_per_property(std::numeric_limits<int>::max()); - snippet_spec.set_max_window_utf32_length(1024); + snippet_spec.set_max_window_bytes(1024); return snippet_spec; } @@ -362,8 +362,8 @@ TEST_F(ResultRetrieverTest, NotIgnoreErrors) { TEST_F(ResultRetrieverTest, IOErrorShouldReturnInternalError) { MockFilesystem mock_filesystem; - ON_CALL(mock_filesystem, PRead(A<int>(), A<void*>(), A<size_t>(), A<off_t>())) - .WillByDefault(Return(false)); + ON_CALL(mock_filesystem, OpenForRead(_)).WillByDefault(Return(false)); + ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, DocumentStore::Create(&mock_filesystem, test_dir_, &fake_clock_, diff --git a/icing/result/result-state-manager_test.cc b/icing/result/result-state-manager_test.cc index 8a9005d..32e45aa 100644 --- a/icing/result/result-state-manager_test.cc +++ b/icing/result/result-state-manager_test.cc @@ -849,7 +849,7 @@ TEST_F(ResultStateManagerTest, ShouldGetSnippetContext) { ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/1); result_spec.mutable_snippet_spec()->set_num_to_snippet(5); result_spec.mutable_snippet_spec()->set_num_matches_per_property(5); - result_spec.mutable_snippet_spec()->set_max_window_utf32_length(5); + result_spec.mutable_snippet_spec()->set_max_window_bytes(5); SearchSpecProto search_spec; search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); @@ -884,7 +884,7 @@ TEST_F(ResultStateManagerTest, ShouldGetDefaultSnippetContext) { // 0 indicates no snippeting result_spec.mutable_snippet_spec()->set_num_to_snippet(0); result_spec.mutable_snippet_spec()->set_num_matches_per_property(0); - result_spec.mutable_snippet_spec()->set_max_window_utf32_length(0); + result_spec.mutable_snippet_spec()->set_max_window_bytes(0); SearchSpecProto search_spec; search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); diff --git a/icing/result/result-state_test.cc b/icing/result/result-state_test.cc index d92fcfa..f2121a5 100644 --- a/icing/result/result-state_test.cc +++ b/icing/result/result-state_test.cc @@ -143,7 +143,7 @@ TEST_F(ResultStateTest, ShouldReturnSnippetContextAccordingToSpecs) { ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2); result_spec.mutable_snippet_spec()->set_num_to_snippet(5); result_spec.mutable_snippet_spec()->set_num_matches_per_property(5); - result_spec.mutable_snippet_spec()->set_max_window_utf32_length(5); + result_spec.mutable_snippet_spec()->set_max_window_bytes(5); SectionRestrictQueryTermsMap query_terms_map; query_terms_map.emplace("term1", std::unordered_set<std::string>()); @@ -178,7 +178,7 @@ TEST_F(ResultStateTest, NoSnippetingShouldReturnNull) { // stored. 
result_spec.mutable_snippet_spec()->set_num_to_snippet(0); result_spec.mutable_snippet_spec()->set_num_matches_per_property(5); - result_spec.mutable_snippet_spec()->set_max_window_utf32_length(5); + result_spec.mutable_snippet_spec()->set_max_window_bytes(5); SectionRestrictQueryTermsMap query_terms_map; query_terms_map.emplace("term1", std::unordered_set<std::string>()); diff --git a/icing/result/snippet-retriever.cc b/icing/result/snippet-retriever.cc index bd1524e..2a138ec 100644 --- a/icing/result/snippet-retriever.cc +++ b/icing/result/snippet-retriever.cc @@ -41,7 +41,6 @@ #include "icing/transform/normalizer.h" #include "icing/util/character-iterator.h" #include "icing/util/i18n-utils.h" -#include "icing/util/logging.h" #include "icing/util/status-macros.h" namespace icing { @@ -76,81 +75,10 @@ inline std::string AddIndexToPath(int values_size, int index, kRBracket); } -// Returns a string of the normalized text of the input Token. Normalization -// is applied based on the Token's type. -std::string NormalizeToken(const Normalizer& normalizer, const Token& token) { - switch (token.type) { - case Token::Type::REGULAR: - return normalizer.NormalizeTerm(token.text); - case Token::Type::VERBATIM: - return std::string(token.text); - case Token::Type::QUERY_EXCLUSION: - [[fallthrough]]; - case Token::Type::QUERY_LEFT_PARENTHESES: - [[fallthrough]]; - case Token::Type::QUERY_RIGHT_PARENTHESES: - [[fallthrough]]; - case Token::Type::QUERY_OR: - [[fallthrough]]; - case Token::Type::QUERY_PROPERTY: - [[fallthrough]]; - case Token::Type::INVALID: - ICING_LOG(WARNING) << "Unable to normalize token of type: " - << static_cast<int>(token.type); - return std::string(token.text); - } -} - -// Returns a CharacterIterator for token's text, advancing one past the last -// matching character from the query term. -CharacterIterator FindMatchEnd(const Normalizer& normalizer, const Token& token, - const std::string& match_query_term) { - switch (token.type) { - case Token::Type::VERBATIM: { - // VERBATIM tokens are not normalized. This means the non-normalized - // matched query term must be either equal to or a prefix of the token's - // text. Therefore, the match must end at the end of the matched query - // term. - CharacterIterator verbatim_match_end = - CharacterIterator(token.text, 0, 0, 0); - verbatim_match_end.AdvanceToUtf8(match_query_term.length()); - return verbatim_match_end; - } - case Token::Type::QUERY_EXCLUSION: - [[fallthrough]]; - case Token::Type::QUERY_LEFT_PARENTHESES: - [[fallthrough]]; - case Token::Type::QUERY_RIGHT_PARENTHESES: - [[fallthrough]]; - case Token::Type::QUERY_OR: - [[fallthrough]]; - case Token::Type::QUERY_PROPERTY: - [[fallthrough]]; - case Token::Type::INVALID: - ICING_LOG(WARNING) - << "Unexpected Token type " << static_cast<int>(token.type) - << " found when finding match end of query term and token."; - [[fallthrough]]; - case Token::Type::REGULAR: - return normalizer.FindNormalizedMatchEndPosition(token.text, - match_query_term); - } -} - class TokenMatcher { public: virtual ~TokenMatcher() = default; - - // Returns a CharacterIterator pointing just past the end of the substring in - // token.text that matches a query term. Note that the utf* indices will be - // in relation to token.text's start. - // - // If there is no match, then it will construct a CharacterIterator with all - // of its indices set to -1. - // - // Ex. With an exact matcher, query terms=["foo","bar"] and token.text="bar", - // Matches will return a CharacterIterator(u8:3, u16:3, u32:3). 
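// Aside: the (u8:3, u16:3, u32:3) triple in the deleted comment above is
// the convention CharacterIterator uses throughout this file: byte, UTF-16
// code unit, and code point offsets. They coincide for ASCII but diverge
// otherwise; a quick standalone check (C++17):
static_assert(sizeof(u8"bar") - 1 == 3);                     // 3 bytes
static_assert(sizeof(u8"走路") - 1 == 6);                    // 3 bytes/char
static_assert(sizeof(u"走路") / sizeof(char16_t) - 1 == 2);  // 2 UTF-16 units
static_assert(sizeof(U"走路") / sizeof(char32_t) - 1 == 2);  // 2 code points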
- virtual CharacterIterator Matches(Token token) const = 0; + virtual bool Matches(Token token) const = 0; }; class TokenMatcherExact : public TokenMatcher { @@ -163,18 +91,10 @@ class TokenMatcherExact : public TokenMatcher { restricted_query_terms_(restricted_query_terms), normalizer_(normalizer) {} - CharacterIterator Matches(Token token) const override { - std::string s = NormalizeToken(normalizer_, token); - auto itr = unrestricted_query_terms_.find(s); - if (itr == unrestricted_query_terms_.end()) { - itr = restricted_query_terms_.find(s); - } - if (itr != unrestricted_query_terms_.end() && - itr != restricted_query_terms_.end()) { - return FindMatchEnd(normalizer_, token, *itr); - } - - return CharacterIterator(token.text, -1, -1, -1); + bool Matches(Token token) const override { + std::string s = normalizer_.NormalizeTerm(token.text); + return (unrestricted_query_terms_.count(s) > 0) || + (restricted_query_terms_.count(s) > 0); } private: @@ -193,21 +113,22 @@ class TokenMatcherPrefix : public TokenMatcher { restricted_query_terms_(restricted_query_terms), normalizer_(normalizer) {} - CharacterIterator Matches(Token token) const override { - std::string s = NormalizeToken(normalizer_, token); - for (const std::string& query_term : unrestricted_query_terms_) { - if (query_term.length() <= s.length() && - s.compare(0, query_term.length(), query_term) == 0) { - return FindMatchEnd(normalizer_, token, query_term); - } - } - for (const std::string& query_term : restricted_query_terms_) { - if (query_term.length() <= s.length() && - s.compare(0, query_term.length(), query_term) == 0) { - return FindMatchEnd(normalizer_, token, query_term); - } + bool Matches(Token token) const override { + std::string s = normalizer_.NormalizeTerm(token.text); + if (std::any_of(unrestricted_query_terms_.begin(), + unrestricted_query_terms_.end(), + [&s](const std::string& term) { + return term.length() <= s.length() && + s.compare(0, term.length(), term) == 0; + })) { + return true; } - return CharacterIterator(token.text, -1, -1, -1); + return std::any_of(restricted_query_terms_.begin(), + restricted_query_terms_.end(), + [&s](const std::string& term) { + return term.length() <= s.length() && + s.compare(0, term.length(), term) == 0; + }); } private: @@ -245,7 +166,7 @@ libtextclassifier3::StatusOr<CharacterIterator> DetermineWindowStart( const ResultSpecProto::SnippetSpecProto& snippet_spec, std::string_view value, int window_start_min_exclusive_utf32, Tokenizer::Iterator* iterator) { - if (!iterator->ResetToTokenStartingAfter(window_start_min_exclusive_utf32)) { + if (!iterator->ResetToTokenAfter(window_start_min_exclusive_utf32)) { return absl_ports::InternalError( "Couldn't reset tokenizer to determine snippet window!"); } @@ -280,7 +201,7 @@ libtextclassifier3::StatusOr<CharacterIterator> DetermineWindowEnd( const ResultSpecProto::SnippetSpecProto& snippet_spec, std::string_view value, int window_end_max_exclusive_utf32, Tokenizer::Iterator* iterator) { - if (!iterator->ResetToTokenEndingBefore(window_end_max_exclusive_utf32)) { + if (!iterator->ResetToTokenBefore(window_end_max_exclusive_utf32)) { return absl_ports::InternalError( "Couldn't reset tokenizer to determine snippet window!"); } @@ -344,9 +265,9 @@ libtextclassifier3::StatusOr<SnippetMatchProto> RetrieveMatch( int match_len_utf32 = end_itr.utf32_index() - match_pos_utf32; int match_mid_utf32 = match_pos_utf32 + match_len_utf32 / 2; int window_start_min_exclusive_utf32 = - (match_mid_utf32 - snippet_spec.max_window_utf32_length() / 2) - 
1; + (match_mid_utf32 - snippet_spec.max_window_bytes() / 2) - 1; int window_end_max_exclusive_utf32 = - match_mid_utf32 + (snippet_spec.max_window_utf32_length() + 1) / 2; + match_mid_utf32 + (snippet_spec.max_window_bytes() + 1) / 2; snippet_match.set_exact_match_byte_position(start_itr.utf8_index()); snippet_match.set_exact_match_utf16_position(start_itr.utf16_index()); @@ -357,7 +278,7 @@ libtextclassifier3::StatusOr<SnippetMatchProto> RetrieveMatch( // Only include windows if it'll at least include the matched text. Otherwise, // it'll just be an empty string anyways. - if (snippet_spec.max_window_utf32_length() >= match_len_utf32) { + if (snippet_spec.max_window_bytes() >= match_len_utf32) { // Find the beginning of the window. ICING_ASSIGN_OR_RETURN( CharacterIterator window_start, @@ -398,13 +319,8 @@ libtextclassifier3::StatusOr<SnippetMatchProto> RetrieveMatch( // DetermineWindowStart/End may change the position of the iterator. So, // reset the iterator back to the original position. - bool success = false; - if (match_pos_utf32 > 0) { - success = iterator->ResetToTokenStartingAfter(match_pos_utf32 - 1); - } else { - success = iterator->ResetToStart(); - } - + bool success = (match_pos_utf32 > 0) ? iterator->ResetToTokenAfter(match_pos_utf32 - 1) + : iterator->ResetToStart(); if (!success) { return absl_ports::InternalError( "Couldn't reset tokenizer to determine snippet window!"); @@ -448,10 +364,7 @@ void GetEntriesFromProperty(const PropertyProto* current_property, CharacterIterator char_iterator(value); while (iterator->Advance()) { Token token = iterator->GetToken(); - CharacterIterator submatch_end = matcher->Matches(token); - // If the token matched a query term, then submatch_end will point to an - // actual position within token.text. - if (submatch_end.utf8_index() != -1) { + if (matcher->Matches(token)) { if (!char_iterator.AdvanceToUtf8(token.text.data() - value.data())) { // We can't get the char_iterator to a valid position, so there's no // way for us to provide valid utf-16 indices. There's nothing more we @@ -480,15 +393,7 @@ void GetEntriesFromProperty(const PropertyProto* current_property, } } SnippetMatchProto match = std::move(match_or).ValueOrDie(); - // submatch_end refers to a position *within* token.text. - // This, conveniently enough, means that index that submatch_end points - // to is the length of the submatch (because the submatch starts at 0 in - // token.text). - match.set_submatch_byte_length(submatch_end.utf8_index()); - match.set_submatch_utf16_length(submatch_end.utf16_index()); - // Add the values for the submatch. 
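// Aside: a worked example of the window bounds computed in RetrieveMatch
// above, for a match "three" at UTF-32 positions [8, 13) with a maximum
// window of 14:
//   match_len_utf32 = 13 - 8 = 5
//   match_mid_utf32 = 8 + 5 / 2 = 10
//   window_start_min_exclusive_utf32 = (10 - 14 / 2) - 1 = 2
//   window_end_max_exclusive_utf32   = 10 + (14 + 1) / 2 = 17
// Both bounds are exclusive, so the window may occupy positions 3..16,
// i.e. 17 - 2 - 1 = 14 code points, exactly the requested window size.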
snippet_entry.mutable_snippet_matches()->Add(std::move(match)); - if (--match_options->max_matches_remaining <= 0) { *snippet_proto->add_entries() = std::move(snippet_entry); return; diff --git a/icing/result/snippet-retriever_test.cc b/icing/result/snippet-retriever_test.cc index 0de2295..e7988ae 100644 --- a/icing/result/snippet-retriever_test.cc +++ b/icing/result/snippet-retriever_test.cc @@ -22,6 +22,7 @@ #include "gtest/gtest.h" #include "icing/document-builder.h" #include "icing/file/mock-filesystem.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/equals-proto.h" #include "icing/portable/platform.h" #include "icing/proto/document.pb.h" @@ -36,14 +37,12 @@ #include "icing/store/key-mapper.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" -#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/jni-test-helpers.h" #include "icing/testing/snippet-helpers.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" #include "icing/tokenization/language-segmenter-factory.h" #include "icing/tokenization/language-segmenter.h" -#include "icing/transform/map/map-normalizer.h" #include "icing/transform/normalizer-factory.h" #include "icing/transform/normalizer.h" #include "unicode/uloc.h" @@ -58,18 +57,16 @@ using ::testing::Eq; using ::testing::IsEmpty; using ::testing::SizeIs; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = - PropertyConfigProto::Cardinality::OPTIONAL; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED = - PropertyConfigProto::Cardinality::REPEATED; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = + PropertyConfigProto_Cardinality_Code_OPTIONAL; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED = + PropertyConfigProto_Cardinality_Code_REPEATED; -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = - StringIndexingConfig::TokenizerType::PLAIN; -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_VERBATIM = - StringIndexingConfig::TokenizerType::VERBATIM; +constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = + StringIndexingConfig_TokenizerType_Code_PLAIN; -constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; -constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX; +constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; +constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX; std::vector<std::string_view> GetPropertyPaths(const SnippetProto& snippet) { std::vector<std::string_view> paths; @@ -133,7 +130,7 @@ class SnippetRetrieverTest : public testing::Test { snippet_spec_.set_num_to_snippet(std::numeric_limits<int32_t>::max()); snippet_spec_.set_num_matches_per_property( std::numeric_limits<int32_t>::max()); - snippet_spec_.set_max_window_utf32_length(64); + snippet_spec_.set_max_window_bytes(64); } void TearDown() override { @@ -180,7 +177,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeSmallerThanMatch) { // Window starts at the beginning of "three" and ends in the middle of // "three". len=4, orig_window= "thre" - snippet_spec_.set_max_window_utf32_length(4); + snippet_spec_.set_max_window_bytes(4); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -206,7 +203,7 @@ TEST_F(SnippetRetrieverTest, // Window starts at the beginning of "three" and at the exact end of // "three". 
len=5, orig_window= "three" - snippet_spec_.set_max_window_utf32_length(5); + snippet_spec_.set_max_window_bytes(5); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -232,7 +229,7 @@ TEST_F(SnippetRetrieverTest, // Window starts at the beginning of "four" and at the exact end of // "four". len=4, orig_window= "four" - snippet_spec_.set_max_window_utf32_length(4); + snippet_spec_.set_max_window_bytes(4); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -264,7 +261,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsInWhitespace) { // 1. untrimmed, no-shifting window will be (2,17). // 2. trimmed, no-shifting window [4,13) "two three" // 3. trimmed, shifted window [4,18) "two three four" - snippet_spec_.set_max_window_utf32_length(14); + snippet_spec_.set_max_window_bytes(14); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -297,7 +294,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsMidToken) { // 1. untrimmed, no-shifting window will be (1,18). // 2. trimmed, no-shifting window [4,18) "two three four" // 3. trimmed, shifted window [4,20) "two three four.." - snippet_spec_.set_max_window_utf32_length(16); + snippet_spec_.set_max_window_bytes(16); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -323,7 +320,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInPunctuation) { // Window ends in the middle of all the punctuation and window starts at 0. // len=20, orig_window="one two three four.." - snippet_spec_.set_max_window_utf32_length(20); + snippet_spec_.set_max_window_bytes(20); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -351,7 +348,7 @@ TEST_F(SnippetRetrieverTest, // Window ends in the middle of all the punctuation and window starts at 0. // len=26, orig_window="pside down in Australia¿" - snippet_spec_.set_max_window_utf32_length(24); + snippet_spec_.set_max_window_bytes(24); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -379,7 +376,7 @@ TEST_F(SnippetRetrieverTest, // Window ends in the middle of all the punctuation and window starts at 0. // len=26, orig_window="upside down in Australia¿ " - snippet_spec_.set_max_window_utf32_length(26); + snippet_spec_.set_max_window_bytes(26); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -412,7 +409,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsBeforeValueStart) { // 1. untrimmed, no-shifting window will be (-2,21). // 2. trimmed, no-shifting window [0,21) "one two three four..." // 3. trimmed, shifted window [0,22) "one two three four...." - snippet_spec_.set_max_window_utf32_length(22); + snippet_spec_.set_max_window_bytes(22); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -438,7 +435,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInWhitespace) { // Window ends before "five" but after all the punctuation // len=26, orig_window="one two three four.... 
" - snippet_spec_.set_max_window_utf32_length(26); + snippet_spec_.set_max_window_bytes(26); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -471,7 +468,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsMidToken) { // 1. untrimmed, no-shifting window will be ((-7,26). // 2. trimmed, no-shifting window [0,26) "one two three four...." // 3. trimmed, shifted window [0,27) "one two three four.... five" - snippet_spec_.set_max_window_utf32_length(32); + snippet_spec_.set_max_window_bytes(32); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -497,7 +494,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeEqualToValueSize) { // Max window size equals the size of the value. // len=34, orig_window="one two three four.... five" - snippet_spec_.set_max_window_utf32_length(34); + snippet_spec_.set_max_window_bytes(34); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -523,7 +520,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeLargerThanValueSize) { // Max window size exceeds the size of the value. // len=36, orig_window="one two three four.... five" - snippet_spec_.set_max_window_utf32_length(36); + snippet_spec_.set_max_window_bytes(36); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -557,7 +554,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStart) { // 1. untrimmed, no-shifting window will be (-10,19). // 2. trimmed, no-shifting window [0,19) "one two three four." // 3. trimmed, shifted window [0,27) "one two three four.... five" - snippet_spec_.set_max_window_utf32_length(28); + snippet_spec_.set_max_window_bytes(28); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -591,7 +588,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEnd) { // 1. untrimmed, no-shifting window will be (10,39). // 2. trimmed, no-shifting window [14,31) "four.... five six" // 3. trimmed, shifted window [4,31) "two three four.... five six" - snippet_spec_.set_max_window_utf32_length(28); + snippet_spec_.set_max_window_bytes(28); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -625,7 +622,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStartShortText) { // 1. untrimmed, no-shifting window will be (-10,19). // 2. trimmed, no-shifting window [0, 19) "one two three four." // 3. trimmed, shifted window [0, 22) "one two three four...." - snippet_spec_.set_max_window_utf32_length(28); + snippet_spec_.set_max_window_bytes(28); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -659,7 +656,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEndShortText) { // 1. untrimmed, no-shifting window will be (1,30). // 2. trimmed, no-shifting window [4, 22) "two three four...." // 3. trimmed, shifted window [0, 22) "one two three four...." 
- snippet_spec_.set_max_window_utf32_length(28); + snippet_spec_.set_max_window_bytes(28); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -693,7 +690,6 @@ TEST_F(SnippetRetrieverTest, PrefixSnippeting) { EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("subject foo")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("f")); } TEST_F(SnippetRetrieverTest, ExactSnippeting) { @@ -723,7 +719,7 @@ TEST_F(SnippetRetrieverTest, SimpleSnippetingNoWindowing) { .AddStringProperty("body", "Only a fool would match this content.") .Build(); - snippet_spec_.set_max_window_utf32_length(0); + snippet_spec_.set_max_window_bytes(0); SectionIdMask section_mask = 0b00000011; SectionRestrictQueryTermsMap query_terms{{"", {"foo"}}}; @@ -737,7 +733,6 @@ TEST_F(SnippetRetrieverTest, SimpleSnippetingNoWindowing) { GetString(&document, snippet.entries(0).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("foo")); } TEST_F(SnippetRetrieverTest, SnippetingMultipleMatches) { @@ -784,15 +779,12 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatches) { "we need to begin considering our options regarding body bar.")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo", "bar")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), - ElementsAre("foo", "bar")); EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject")); content = GetString(&document, snippet.entries(1).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("subject foo")); EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("foo")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("foo")); } TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrict) { @@ -842,8 +834,6 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrict) { "we need to begin considering our options regarding body bar.")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo", "bar")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), - ElementsAre("foo", "bar")); } TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrictedTerm) { @@ -894,16 +884,12 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrictedTerm) { "Concerning the subject of foo, we need to begin considering our")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("subject", "foo")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), - ElementsAre("subject", "foo")); EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject")); content = GetString(&document, snippet.entries(1).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("subject foo")); EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("subject")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), - ElementsAre("subject")); } TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesOneMatchPerProperty) { @@ -947,14 +933,12 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesOneMatchPerProperty) { ElementsAre( "Concerning the subject of foo, we need to begin considering our")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), 
ElementsAre("foo")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("foo")); EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject")); content = GetString(&document, snippet.entries(1).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("subject foo")); EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("foo")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("foo")); } TEST_F(SnippetRetrieverTest, PrefixSnippetingNormalization) { @@ -976,7 +960,6 @@ TEST_F(SnippetRetrieverTest, PrefixSnippetingNormalization) { GetString(&document, snippet.entries(0).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("MDI team")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("MDI")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("MD")); } TEST_F(SnippetRetrieverTest, ExactSnippetingNormalization) { @@ -1000,9 +983,6 @@ TEST_F(SnippetRetrieverTest, ExactSnippetingNormalization) { EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("Some members are in Zürich.")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("Zürich")); - - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), - ElementsAre("Zürich")); } TEST_F(SnippetRetrieverTest, SnippetingTestOneLevel) { @@ -1063,13 +1043,11 @@ TEST_F(SnippetRetrieverTest, SnippetingTestOneLevel) { GetString(&document, snippet.entries(0).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo")); EXPECT_THAT(snippet.entries(1).property_name(), Eq("X[3]")); content = GetString(&document, snippet.entries(1).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo")); EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo")); EXPECT_THAT(GetPropertyPaths(snippet), ElementsAre("X[1]", "X[3]", "Y[1]", "Y[3]", "Z[1]", "Z[3]")); @@ -1166,13 +1144,11 @@ TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevel) { GetString(&document, snippet.entries(0).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo")); EXPECT_THAT(snippet.entries(1).property_name(), Eq("A.X[3]")); content = GetString(&document, snippet.entries(1).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo")); EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo")); EXPECT_THAT( GetPropertyPaths(snippet), @@ -1275,13 +1251,11 @@ TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevelRepeated) { GetString(&document, snippet.entries(0).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo")); EXPECT_THAT(snippet.entries(1).property_name(), Eq("A[0].X[3]")); content = GetString(&document, snippet.entries(1).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo")); 
EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo")); EXPECT_THAT(GetPropertyPaths(snippet), ElementsAre("A[0].X[1]", "A[0].X[3]", "A[1].X[1]", "A[1].X[3]", @@ -1382,13 +1356,11 @@ TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevelSingleValue) { GetString(&document, snippet.entries(0).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo")); EXPECT_THAT(snippet.entries(1).property_name(), Eq("A[1].X")); content = GetString(&document, snippet.entries(1).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo")); EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo")); - EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo")); EXPECT_THAT( GetPropertyPaths(snippet), @@ -1432,12 +1404,10 @@ TEST_F(SnippetRetrieverTest, CJKSnippetMatchTest) { // Ensure that the match is correct. EXPECT_THAT(GetMatches(content, *entry), ElementsAre("走路")); - EXPECT_THAT(GetSubMatches(content, *entry), ElementsAre("走")); // Ensure that the utf-16 values are also as expected EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(3)); EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(2)); - EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(1)); } TEST_F(SnippetRetrieverTest, CJKSnippetWindowTest) { @@ -1475,7 +1445,7 @@ TEST_F(SnippetRetrieverTest, CJKSnippetWindowTest) { // 1. untrimmed, no-shifting window will be (0,7). // 2. trimmed, no-shifting window [1, 6) "每天走路去". // 3. trimmed, shifted window [0, 6) "我每天走路去" - snippet_spec_.set_max_window_utf32_length(6); + snippet_spec_.set_max_window_bytes(6); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); @@ -1537,12 +1507,10 @@ TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitSnippetMatchTest) { // Ensure that the match is correct. 
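// Aside: the utf-16 expectations below come from surrogate pairs: 𐀂 and 𐀃
// (U+10002, U+10003) each encode as two UTF-16 code units and four UTF-8
// bytes, hence a match length of 4 utf-16 units for "𐀂𐀃". Standalone
// check (C++17):
static_assert(sizeof(u"𐀂𐀃") / sizeof(char16_t) - 1 == 4);   // 2 + 2 units
static_assert(sizeof(u8"𐀂𐀃") - 1 == 8);                     // 4 + 4 bytes
static_assert(sizeof(U"𐀂𐀃") / sizeof(char32_t) - 1 == 2);   // 2 code points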
EXPECT_THAT(GetMatches(content, *entry), ElementsAre("𐀂𐀃")); - EXPECT_THAT(GetSubMatches(content, *entry), ElementsAre("𐀂")); // Ensure that the utf-16 values are also as expected EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(5)); EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(4)); - EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(2)); } TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitWindowTest) { @@ -1574,7 +1542,7 @@ TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitWindowTest) { // UTF8 idx: 9 22 // UTF16 idx: 5 12 // UTF32 idx: 3 7 - snippet_spec_.set_max_window_utf32_length(6); + snippet_spec_.set_max_window_bytes(6); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); @@ -1598,117 +1566,6 @@ TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitWindowTest) { EXPECT_THAT(match_proto.window_utf16_length(), Eq(7)); } -TEST_F(SnippetRetrieverTest, SnippettingVerbatimAscii) { - SchemaProto schema = - SchemaBuilder() - .AddType(SchemaTypeConfigBuilder() - .SetType("verbatimType") - .AddProperty(PropertyConfigBuilder() - .SetName("verbatim") - .SetDataTypeString(MATCH_EXACT, - TOKENIZER_VERBATIM) - .SetCardinality(CARDINALITY_REPEATED))) - .Build(); - ICING_ASSERT_OK(schema_store_->SetSchema( - schema, /*ignore_errors_and_delete_documents=*/true)); - ICING_ASSERT_OK_AND_ASSIGN( - snippet_retriever_, - SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(), - normalizer_.get())); - - DocumentProto document = DocumentBuilder() - .SetKey("icing", "verbatim/1") - .SetSchema("verbatimType") - .AddStringProperty("verbatim", "Hello, world!") - .Build(); - - SectionIdMask section_mask = 0b00000001; - SectionRestrictQueryTermsMap query_terms{{"", {"Hello, world!"}}}; - - snippet_spec_.set_max_window_utf32_length(13); - SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); - - // There should only be one snippet entry and match, the verbatim token in its - // entirety. - ASSERT_THAT(snippet.entries(), SizeIs(1)); - - const SnippetProto::EntryProto* entry = &snippet.entries(0); - ASSERT_THAT(entry->snippet_matches(), SizeIs(1)); - ASSERT_THAT(entry->property_name(), "verbatim"); - - const SnippetMatchProto& match_proto = entry->snippet_matches(0); - // We expect the match to begin at position 0, and to span the entire token - // which contains 13 characters. - EXPECT_THAT(match_proto.window_byte_position(), Eq(0)); - EXPECT_THAT(match_proto.window_utf16_length(), Eq(13)); - - // We expect the submatch to begin at position 0 of the verbatim token and - // span the length of our query term "Hello, world!", which has utf-16 length - // of 13. The submatch length is equal to the window length as the query the - // snippet is retrieved with an exact term match. 
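// Aside, for contrast with the verbatim behavior asserted here: a
// PLAIN-tokenized property is segmented on whitespace and punctuation (and
// lowercased for matching), so a query for the full string could never
// match it exactly. Roughly:
//   TOKENIZER_PLAIN    "Hello, world!" -> tokens {"Hello", "world"}
//   TOKENIZER_VERBATIM "Hello, world!" -> tokens {"Hello, world!"}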
- EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(0)); - EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(13)); -} - -TEST_F(SnippetRetrieverTest, SnippettingVerbatimCJK) { - SchemaProto schema = - SchemaBuilder() - .AddType(SchemaTypeConfigBuilder() - .SetType("verbatimType") - .AddProperty(PropertyConfigBuilder() - .SetName("verbatim") - .SetDataTypeString(MATCH_PREFIX, - TOKENIZER_VERBATIM) - .SetCardinality(CARDINALITY_REPEATED))) - .Build(); - ICING_ASSERT_OK(schema_store_->SetSchema( - schema, /*ignore_errors_and_delete_documents=*/true)); - ICING_ASSERT_OK_AND_ASSIGN( - snippet_retriever_, - SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(), - normalizer_.get())); - - // String: "我每天走路去上班。" - // ^ ^ ^ ^^ - // UTF8 idx: 0 3 9 15 18 - // UTF16 idx: 0 1 3 5 6 - // UTF32 idx: 0 1 3 5 6 - // Breaks into segments: "我", "每天", "走路", "去", "上班" - std::string chinese_string = "我每天走路去上班。"; - DocumentProto document = DocumentBuilder() - .SetKey("icing", "verbatim/1") - .SetSchema("verbatimType") - .AddStringProperty("verbatim", chinese_string) - .Build(); - - SectionIdMask section_mask = 0b00000001; - SectionRestrictQueryTermsMap query_terms{{"", {"我每"}}}; - - snippet_spec_.set_max_window_utf32_length(9); - SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); - - // There should only be one snippet entry and match, the verbatim token in its - // entirety. - ASSERT_THAT(snippet.entries(), SizeIs(1)); - - const SnippetProto::EntryProto* entry = &snippet.entries(0); - ASSERT_THAT(entry->snippet_matches(), SizeIs(1)); - ASSERT_THAT(entry->property_name(), "verbatim"); - - const SnippetMatchProto& match_proto = entry->snippet_matches(0); - // We expect the match to begin at position 0, and to span the entire token - // which has utf-16 length of 9. - EXPECT_THAT(match_proto.window_byte_position(), Eq(0)); - EXPECT_THAT(match_proto.window_utf16_length(), Eq(9)); - - // We expect the submatch to begin at position 0 of the verbatim token and - // span the length of our query term "我每", which has utf-16 length of 2. 
- EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(0)); - EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(2)); -} - } // namespace } // namespace lib diff --git a/icing/schema/schema-store.cc b/icing/schema/schema-store.cc index fc50ea6..e9ba654 100644 --- a/icing/schema/schema-store.cc +++ b/icing/schema/schema-store.cc @@ -108,60 +108,27 @@ libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> SchemaStore::Create( ICING_RETURN_ERROR_IF_NULL(filesystem); ICING_RETURN_ERROR_IF_NULL(clock); - if (!filesystem->DirectoryExists(base_dir.c_str())) { - return absl_ports::FailedPreconditionError( - "Schema store base directory does not exist!"); - } std::unique_ptr<SchemaStore> schema_store = std::unique_ptr<SchemaStore>( new SchemaStore(filesystem, base_dir, clock)); ICING_RETURN_IF_ERROR(schema_store->Initialize(initialize_stats)); return schema_store; } -libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> SchemaStore::Create( - const Filesystem* filesystem, const std::string& base_dir, - const Clock* clock, SchemaProto schema) { - ICING_RETURN_ERROR_IF_NULL(filesystem); - ICING_RETURN_ERROR_IF_NULL(clock); - - if (!filesystem->DirectoryExists(base_dir.c_str())) { - return absl_ports::FailedPreconditionError( - "Schema store base directory does not exist!"); - } - std::unique_ptr<SchemaStore> schema_store = std::unique_ptr<SchemaStore>( - new SchemaStore(filesystem, base_dir, clock)); - ICING_RETURN_IF_ERROR(schema_store->Initialize(std::move(schema))); - return schema_store; -} - SchemaStore::SchemaStore(const Filesystem* filesystem, std::string base_dir, const Clock* clock) - : filesystem_(filesystem), + : filesystem_(*filesystem), base_dir_(std::move(base_dir)), - clock_(clock), - schema_file_(std::make_unique<FileBackedProto<SchemaProto>>( - *filesystem, MakeSchemaFilename(base_dir_))) {} + clock_(*clock), + schema_file_(*filesystem, MakeSchemaFilename(base_dir_)) {} SchemaStore::~SchemaStore() { - if (has_schema_successfully_set_ && schema_file_ != nullptr && - schema_type_mapper_ != nullptr && section_manager_ != nullptr) { + if (has_schema_successfully_set_) { if (!PersistToDisk().ok()) { ICING_LOG(ERROR) << "Error persisting to disk in SchemaStore destructor"; } } } -libtextclassifier3::Status SchemaStore::Initialize(SchemaProto new_schema) { - if (!absl_ports::IsNotFound(GetSchema().status())) { - return absl_ports::FailedPreconditionError( - "Incorrectly tried to initialize schema store with a new schema, when " - "one is already set!"); - } - ICING_RETURN_IF_ERROR(schema_file_->Write( - std::make_unique<SchemaProto>(std::move(new_schema)))); - return InitializeInternal(/*initialize_stats=*/nullptr); -} - libtextclassifier3::Status SchemaStore::Initialize( InitializeStatsProto* initialize_stats) { auto schema_proto_or = GetSchema(); @@ -172,16 +139,13 @@ libtextclassifier3::Status SchemaStore::Initialize( // Real error when trying to read the existing schema return schema_proto_or.status(); } - return InitializeInternal(initialize_stats); -} + has_schema_successfully_set_ = true; -libtextclassifier3::Status SchemaStore::InitializeInternal( - InitializeStatsProto* initialize_stats) { if (!InitializeDerivedFiles().ok()) { ICING_VLOG(3) << "Couldn't find derived files or failed to initialize them, " "regenerating derived files for SchemaStore."; - std::unique_ptr<Timer> regenerate_timer = clock_->GetNewTimer(); + std::unique_ptr<Timer> regenerate_timer = clock_.GetNewTimer(); if (initialize_stats != nullptr) { initialize_stats->set_schema_store_recovery_cause( 
InitializeStatsProto::IO_ERROR); @@ -197,7 +161,6 @@ libtextclassifier3::Status SchemaStore::InitializeInternal( initialize_stats->set_num_schema_types(type_config_map_.size()); } - has_schema_successfully_set_ = true; return libtextclassifier3::Status::OK; } @@ -209,8 +172,8 @@ libtextclassifier3::Status SchemaStore::InitializeDerivedFiles() { } SchemaStore::Header header; - if (!filesystem_->Read(MakeHeaderFilename(base_dir_).c_str(), &header, - sizeof(header))) { + if (!filesystem_.Read(MakeHeaderFilename(base_dir_).c_str(), &header, + sizeof(header))) { return absl_ports::InternalError( absl_ports::StrCat("Couldn't read: ", MakeHeaderFilename(base_dir_))); } @@ -222,7 +185,7 @@ libtextclassifier3::Status SchemaStore::InitializeDerivedFiles() { ICING_ASSIGN_OR_RETURN( schema_type_mapper_, - KeyMapper<SchemaTypeId>::Create(*filesystem_, + KeyMapper<SchemaTypeId>::Create(filesystem_, MakeSchemaTypeMapperFilename(base_dir_), kSchemaTypeMapperMaxSize)); @@ -273,12 +236,12 @@ libtextclassifier3::Status SchemaStore::RegenerateDerivedFiles() { } bool SchemaStore::HeaderExists() { - if (!filesystem_->FileExists(MakeHeaderFilename(base_dir_).c_str())) { + if (!filesystem_.FileExists(MakeHeaderFilename(base_dir_).c_str())) { return false; } int64_t file_size = - filesystem_->GetFileSize(MakeHeaderFilename(base_dir_).c_str()); + filesystem_.GetFileSize(MakeHeaderFilename(base_dir_).c_str()); // If it's been truncated to size 0 before, we consider it to be a new file return file_size != 0 && file_size != Filesystem::kBadFileSize; @@ -291,11 +254,11 @@ libtextclassifier3::Status SchemaStore::UpdateHeader(const Crc32& checksum) { header.checksum = checksum.Get(); ScopedFd scoped_fd( - filesystem_->OpenForWrite(MakeHeaderFilename(base_dir_).c_str())); + filesystem_.OpenForWrite(MakeHeaderFilename(base_dir_).c_str())); // This should overwrite the header. if (!scoped_fd.is_valid() || - !filesystem_->Write(scoped_fd.get(), &header, sizeof(header)) || - !filesystem_->DataSync(scoped_fd.get())) { + !filesystem_.Write(scoped_fd.get(), &header, sizeof(header)) || + !filesystem_.DataSync(scoped_fd.get())) { return absl_ports::InternalError(absl_ports::StrCat( "Failed to write SchemaStore header: ", MakeHeaderFilename(base_dir_))); } @@ -305,10 +268,10 @@ libtextclassifier3::Status SchemaStore::UpdateHeader(const Crc32& checksum) { libtextclassifier3::Status SchemaStore::ResetSchemaTypeMapper() { // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset(). schema_type_mapper_.reset(); - // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. 
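// Aside: UpdateHeader above is the classic open/write/sync pattern. A
// minimal POSIX analogue (WriteHeaderDurably is a hypothetical name; the
// real code goes through Filesystem and ScopedFd):
#include <fcntl.h>
#include <unistd.h>
#include <cstddef>

bool WriteHeaderDurably(const char* path, const void* header, size_t size) {
  int fd = open(path, O_WRONLY | O_CREAT, 0644);  // overwrite in place
  if (fd < 0) return false;
  bool ok = write(fd, header, size) == static_cast<ssize_t>(size) &&
            fdatasync(fd) == 0;  // like DataSync(): flush before returning
  close(fd);
  return ok;
}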
libtextclassifier3::Status status = KeyMapper<SchemaTypeId>::Delete( - *filesystem_, MakeSchemaTypeMapperFilename(base_dir_)); + filesystem_, MakeSchemaTypeMapperFilename(base_dir_)); if (!status.ok()) { ICING_LOG(ERROR) << status.error_message() << "Failed to delete old schema_type mapper"; @@ -316,7 +279,7 @@ libtextclassifier3::Status SchemaStore::ResetSchemaTypeMapper() { } ICING_ASSIGN_OR_RETURN( schema_type_mapper_, - KeyMapper<SchemaTypeId>::Create(*filesystem_, + KeyMapper<SchemaTypeId>::Create(filesystem_, MakeSchemaTypeMapperFilename(base_dir_), kSchemaTypeMapperMaxSize)); @@ -324,17 +287,17 @@ libtextclassifier3::Status SchemaStore::ResetSchemaTypeMapper() { } libtextclassifier3::StatusOr<Crc32> SchemaStore::ComputeChecksum() const { - auto schema_proto_or = GetSchema(); - if (absl_ports::IsNotFound(schema_proto_or.status())) { - return Crc32(); + Crc32 total_checksum; + if (!has_schema_successfully_set_) { + // Nothing to checksum + return total_checksum; } - ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, schema_proto_or); + ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, GetSchema()); Crc32 schema_checksum; schema_checksum.Append(schema_proto->SerializeAsString()); Crc32 schema_type_mapper_checksum = schema_type_mapper_->ComputeChecksum(); - Crc32 total_checksum; total_checksum.Append(std::to_string(schema_checksum.Get())); total_checksum.Append(std::to_string(schema_type_mapper_checksum.Get())); @@ -343,7 +306,7 @@ libtextclassifier3::StatusOr<Crc32> SchemaStore::ComputeChecksum() const { libtextclassifier3::StatusOr<const SchemaProto*> SchemaStore::GetSchema() const { - return schema_file_->Read(); + return schema_file_.Read(); } // TODO(cassiewang): Consider removing this definition of SetSchema if it's not @@ -368,9 +331,6 @@ SchemaStore::SetSchema(SchemaProto&& new_schema, if (absl_ports::IsNotFound(schema_proto_or.status())) { // We don't have a pre-existing schema, so anything is valid. result.success = true; - for (const SchemaTypeConfigProto& type_config : new_schema.types()) { - result.schema_types_new_by_name.insert(type_config.schema_type()); - } } else if (!schema_proto_or.ok()) { // Real error return schema_proto_or.status(); @@ -391,11 +351,8 @@ SchemaStore::SetSchema(SchemaProto&& new_schema, SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema, new_dependency_map); - result.schema_types_new_by_name = std::move(schema_delta.schema_types_new); - result.schema_types_changed_fully_compatible_by_name = - std::move(schema_delta.schema_types_changed_fully_compatible); - result.schema_types_index_incompatible_by_name = - std::move(schema_delta.schema_types_index_incompatible); + // An incompatible index is fine, we can just reindex + result.index_incompatible = schema_delta.index_incompatible; for (const auto& schema_type : schema_delta.schema_types_deleted) { // We currently don't support deletions, so mark this as not possible. 
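
The reworked ComputeChecksum above folds each subcomponent checksum into one total by appending the stringified values to a running CRC, and returns the default checksum when no schema has been set. A standalone sketch of that composition, substituting a toy bitwise CRC-32 for icing's Crc32 class:

#include <cstdint>
#include <string>

// Toy reflected CRC-32 (polynomial 0xEDB88320), written in append style so
// checksums can be chained; icing's Crc32 is assumed to behave similarly.
uint32_t Crc32Append(uint32_t crc, const std::string& data) {
  crc = ~crc;
  for (unsigned char c : data) {
    crc ^= c;
    for (int i = 0; i < 8; ++i) {
      crc = (crc >> 1) ^ ((crc & 1u) ? 0xEDB88320u : 0u);
    }
  }
  return ~crc;
}

// Mirrors the composition in ComputeChecksum(): stringify each component
// checksum and fold it into the total, so a change in any component is
// meant to change the total as well.
uint32_t TotalChecksum(const std::string& serialized_schema,
                       uint32_t mapper_checksum) {
  uint32_t schema_crc = Crc32Append(0, serialized_schema);
  uint32_t total = 0;
  total = Crc32Append(total, std::to_string(schema_crc));
  total = Crc32Append(total, std::to_string(mapper_checksum));
  return total;
}
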
@@ -430,78 +387,15 @@ SchemaStore::SetSchema(SchemaProto&& new_schema, result.success = result.success || ignore_errors_and_delete_documents; if (result.success) { - ICING_RETURN_IF_ERROR(ApplySchemaChange(std::move(new_schema))); + // Write the schema (and potentially overwrite a previous schema) + ICING_RETURN_IF_ERROR( + schema_file_.Write(std::make_unique<SchemaProto>(new_schema))); has_schema_successfully_set_ = true; - } - return result; -} - -libtextclassifier3::Status SchemaStore::ApplySchemaChange( - SchemaProto new_schema) { - // We need to ensure that we either 1) successfully set the schema and - // update all derived data structures or 2) fail and leave the schema store - // unchanged. - // So, first, we create an empty temporary directory to build a new schema - // store in. - std::string temp_schema_store_dir_path = base_dir_ + "_temp"; - if (!filesystem_->DeleteDirectoryRecursively( - temp_schema_store_dir_path.c_str())) { - ICING_LOG(WARNING) << "Failed to recursively delete " - << temp_schema_store_dir_path.c_str(); - return absl_ports::InternalError( - "Unable to delete temp directory to prepare to build new schema " - "store."); - } - - if (!filesystem_->CreateDirectoryRecursively( - temp_schema_store_dir_path.c_str())) { - return absl_ports::InternalError( - "Unable to create temp directory to build new schema store."); - } - - // Then we create our new schema store with the new schema. - auto new_schema_store_or = - SchemaStore::Create(filesystem_, temp_schema_store_dir_path, clock_, - std::move(new_schema)); - if (!new_schema_store_or.ok()) { - // Attempt to clean up the temp directory. - if (!filesystem_->DeleteDirectoryRecursively( - temp_schema_store_dir_path.c_str())) { - // Nothing to do here. Just log an error. - ICING_LOG(WARNING) << "Failed to recursively delete " - << temp_schema_store_dir_path.c_str(); - } - return new_schema_store_or.status(); - } - std::unique_ptr<SchemaStore> new_schema_store = - std::move(new_schema_store_or).ValueOrDie(); - - // Then we swap the new schema file + new derived files with the old files. - if (!filesystem_->SwapFiles(base_dir_.c_str(), - temp_schema_store_dir_path.c_str())) { - // Attempt to clean up the temp directory. - if (!filesystem_->DeleteDirectoryRecursively( - temp_schema_store_dir_path.c_str())) { - // Nothing to do here. Just log an error. - ICING_LOG(WARNING) << "Failed to recursively delete " - << temp_schema_store_dir_path.c_str(); - } - return absl_ports::InternalError( - "Unable to apply new schema due to failed swap!"); + ICING_RETURN_IF_ERROR(RegenerateDerivedFiles()); } - std::string old_base_dir = std::move(base_dir_); - *this = std::move(*new_schema_store); - - // After the std::move, the filepaths saved in this instance and in the - // schema_file_ instance will still be the one from temp_schema_store_dir - // even though they now point to files that are within old_base_dir. - // Manually set them to the correct paths. 
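
The ApplySchemaChange removed above implemented a build-then-swap transaction: construct the new store in a scratch directory, then swap it with the live one, so a failure partway through never leaves a half-updated store on disk. A generic sketch of that shape with std::filesystem; note that std::filesystem has no atomic directory swap, so the two renames below are a weaker stand-in for the SwapFiles call in the removed code:

#include <filesystem>
#include <string>

namespace fs = std::filesystem;

// BuildNewStore is a hypothetical callback that writes the new state into
// the scratch directory; if it throws, the live directory is untouched.
template <typename BuildFn>
void BuildThenSwap(const std::string& live_dir, BuildFn BuildNewStore) {
  const fs::path temp_dir = live_dir + "_temp";
  fs::remove_all(temp_dir);          // start from a clean scratch area
  fs::create_directories(temp_dir);
  BuildNewStore(temp_dir.string());
  const fs::path old_dir = live_dir + "_old";
  fs::rename(live_dir, old_dir);     // two renames, not one atomic swap
  fs::rename(temp_dir, live_dir);
  fs::remove_all(old_dir);           // discard the superseded state
}

A caller would invoke it as BuildThenSwap(base_dir, [&](const std::string& dir) { /* write the new schema store into dir */ });.
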
- base_dir_ = std::move(old_base_dir); - schema_file_->SetSwappedFilepath(MakeSchemaFilename(base_dir_)); - - return libtextclassifier3::Status::OK; + return result; } libtextclassifier3::StatusOr<const SchemaTypeConfigProto*> @@ -563,9 +457,12 @@ libtextclassifier3::Status SchemaStore::PersistToDisk() { SchemaStoreStorageInfoProto SchemaStore::GetStorageInfo() const { SchemaStoreStorageInfoProto storage_info; - int64_t directory_size = filesystem_->GetDiskUsage(base_dir_.c_str()); - storage_info.set_schema_store_size( - Filesystem::SanitizeFileSize(directory_size)); + int64_t directory_size = filesystem_.GetDiskUsage(base_dir_.c_str()); + if (directory_size != Filesystem::kBadFileSize) { + storage_info.set_schema_store_size(directory_size); + } else { + storage_info.set_schema_store_size(-1); + } ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema(), storage_info); storage_info.set_num_schema_types(schema->types_size()); int total_sections = 0; @@ -588,22 +485,5 @@ SchemaStoreStorageInfoProto SchemaStore::GetStorageInfo() const { return storage_info; } -libtextclassifier3::StatusOr<const std::vector<SectionMetadata>*> -SchemaStore::GetSectionMetadata(const std::string& schema_type) const { - return section_manager_->GetMetadataList(schema_type); -} - -libtextclassifier3::StatusOr<SchemaDebugInfoProto> SchemaStore::GetDebugInfo() - const { - SchemaDebugInfoProto debug_info; - if (has_schema_successfully_set_) { - ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema()); - *debug_info.mutable_schema() = *schema; - } - ICING_ASSIGN_OR_RETURN(Crc32 crc, ComputeChecksum()); - debug_info.set_crc(crc.Get()); - return debug_info; -} - } // namespace lib } // namespace icing diff --git a/icing/schema/schema-store.h b/icing/schema/schema-store.h index 58e5477..dd1edb8 100644 --- a/icing/schema/schema-store.h +++ b/icing/schema/schema-store.h @@ -26,7 +26,6 @@ #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/file/file-backed-proto.h" #include "icing/file/filesystem.h" -#include "icing/proto/debug.pb.h" #include "icing/proto/document.pb.h" #include "icing/proto/logging.pb.h" #include "icing/proto/schema.pb.h" @@ -69,6 +68,9 @@ class SchemaStore { // to file. bool success = false; + // Whether the new schema changes invalidate the index. + bool index_incompatible = false; + // SchemaTypeIds of schema types can be reassigned new SchemaTypeIds if: // 1. Schema types are added in the middle of the SchemaProto // 2. Schema types are removed from the middle of the SchemaProto @@ -98,21 +100,6 @@ class SchemaStore { // SchemaUtil::ComputeCompatibilityDelta. Represented by the SchemaTypeId // assigned to this SchemaTypeConfigProto in the *old* schema. std::unordered_set<SchemaTypeId> schema_types_incompatible_by_id; - - // Schema types that were added in the new schema. Represented by the - // `schema_type` field in the SchemaTypeConfigProto. - std::unordered_set<std::string> schema_types_new_by_name; - - // Schema types that were changed in a way that was backwards compatible and - // didn't invalidate the index. Represented by the `schema_type` field in - // the SchemaTypeConfigProto. - std::unordered_set<std::string> - schema_types_changed_fully_compatible_by_name; - - // Schema types that were changed in a way that was backwards compatible, - // but invalidated the index. Represented by the `schema_type` field in the - // SchemaTypeConfigProto. 
- std::unordered_set<std::string> schema_types_index_incompatible_by_name; }; // Factory function to create a SchemaStore which does not take ownership @@ -130,17 +117,17 @@ class SchemaStore { static libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> Create( const Filesystem* filesystem, const std::string& base_dir, const Clock* clock, InitializeStatsProto* initialize_stats = nullptr); - - SchemaStore(SchemaStore&&) = default; - SchemaStore& operator=(SchemaStore&&) = default; + // Not copyable SchemaStore(const SchemaStore&) = delete; SchemaStore& operator=(const SchemaStore&) = delete; // Persists and updates checksum of subcomponents. ~SchemaStore(); - // Retrieve the current schema if it exists. + // Retrieve the current schema if it exists. Caller does not get ownership of + // the schema proto and modifying the returned pointer does not affect the + // underlying schema proto. // // Returns: // SchemaProto* if exists @@ -247,70 +234,23 @@ class SchemaStore { // INTERNAL_ERROR on compute error libtextclassifier3::StatusOr<Crc32> ComputeChecksum() const; - // Returns: - // - On success, the section metadata list for the specified schema type - // - NOT_FOUND if the schema type is not present in the schema - libtextclassifier3::StatusOr<const std::vector<SectionMetadata>*> - GetSectionMetadata(const std::string& schema_type) const; - // Calculates the StorageInfo for the Schema Store. // // If an IO error occurs while trying to calculate the value for a field, then // that field will be set to -1. SchemaStoreStorageInfoProto GetStorageInfo() const; - // Get debug information for the schema store. - // - // Returns: - // SchemaDebugInfoProto on success - // INTERNAL_ERROR on IO errors, crc compute error - libtextclassifier3::StatusOr<SchemaDebugInfoProto> GetDebugInfo() const; - private: - // Factory function to create a SchemaStore and set its schema. The created - // instance does not take ownership of any input components and all pointers - // must refer to valid objects that outlive the created SchemaStore instance. - // The base_dir must already exist. No schema must have set in base_dir prior - // to this. - // - // Returns: - // A SchemaStore on success - // FAILED_PRECONDITION on any null pointer input or if there has already - // been a schema set for this path. - // INTERNAL_ERROR on any IO errors - static libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> Create( - const Filesystem* filesystem, const std::string& base_dir, - const Clock* clock, SchemaProto schema); - - // Use SchemaStore::Create instead. explicit SchemaStore(const Filesystem* filesystem, std::string base_dir, const Clock* clock); - // Verifies that there is no error retrieving a previously set schema. Then - // initializes like normal. - // - // Returns: - // OK on success - // INTERNAL_ERROR on IO error - libtextclassifier3::Status Initialize(InitializeStatsProto* initialize_stats); - - // First, blindly writes new_schema to the schema_file. Then initializes like - // normal. - // - // Returns: - // OK on success - // INTERNAL_ERROR on IO error - // FAILED_PRECONDITION if there is already a schema set for the schema_file. - libtextclassifier3::Status Initialize(SchemaProto new_schema); - // Handles initializing the SchemaStore and regenerating any data if needed. 
// // Returns: // OK on success // INTERNAL_ERROR on IO error - libtextclassifier3::Status InitializeInternal( - InitializeStatsProto* initialize_stats); + libtextclassifier3::Status Initialize(InitializeStatsProto* initialize_stats); // Creates sub-components and verifies the integrity of each sub-component. // @@ -346,25 +286,15 @@ class SchemaStore { // Returns any IO errors. libtextclassifier3::Status ResetSchemaTypeMapper(); - // Creates a new schema store with new_schema and then swaps that new schema - // store with the existing one. This function guarantees that either: this - // instance will be fully updated to the new schema or no changes will take - // effect. - // - // Returns: - // OK on success - // INTERNAL on I/O error. - libtextclassifier3::Status ApplySchemaChange(SchemaProto new_schema); - libtextclassifier3::Status CheckSchemaSet() const { return has_schema_successfully_set_ ? libtextclassifier3::Status::OK : absl_ports::FailedPreconditionError("Schema not set yet."); } - const Filesystem* filesystem_; - std::string base_dir_; - const Clock* clock_; + const Filesystem& filesystem_; + const std::string base_dir_; + const Clock& clock_; // Used internally to indicate whether the class has been successfully // initialized with a valid schema. Will be false if Initialize failed or no @@ -372,7 +302,7 @@ class SchemaStore { bool has_schema_successfully_set_ = false; // Cached schema - std::unique_ptr<FileBackedProto<SchemaProto>> schema_file_; + FileBackedProto<SchemaProto> schema_file_; // A hash map of (type config name -> type config), allows faster lookup of // type config in schema. The O(1) type config access makes schema-related and diff --git a/icing/schema/schema-store_test.cc b/icing/schema/schema-store_test.cc index 3fd41c4..5ef2dea 100644 --- a/icing/schema/schema-store_test.cc +++ b/icing/schema/schema-store_test.cc @@ -21,9 +21,7 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" #include "icing/absl_ports/str_cat.h" -#include "icing/document-builder.h" #include "icing/file/filesystem.h" -#include "icing/file/mock-filesystem.h" #include "icing/portable/equals-proto.h" #include "icing/proto/document.pb.h" #include "icing/proto/schema.pb.h" @@ -35,7 +33,6 @@ #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" #include "icing/testing/tmp-directory.h" -#include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/util/crc32.h" namespace icing { @@ -47,35 +44,28 @@ using ::icing::lib::portable_equals_proto::EqualsProto; using ::testing::ElementsAre; using ::testing::Eq; using ::testing::Ge; -using ::testing::Gt; -using ::testing::HasSubstr; using ::testing::Not; using ::testing::Pointee; -using ::testing::Return; -using ::testing::SizeIs; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = - PropertyConfigProto::Cardinality::OPTIONAL; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED = - PropertyConfigProto::Cardinality::REPEATED; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = + PropertyConfigProto_Cardinality_Code_OPTIONAL; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED = + PropertyConfigProto_Cardinality_Code_REPEATED; -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = - StringIndexingConfig::TokenizerType::PLAIN; +constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = + StringIndexingConfig_TokenizerType_Code_PLAIN; -constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; -constexpr 
TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX; +constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; -constexpr PropertyConfigProto::DataType::Code TYPE_STRING = - PropertyConfigProto::DataType::STRING; -constexpr PropertyConfigProto::DataType::Code TYPE_DOUBLE = - PropertyConfigProto::DataType::DOUBLE; +constexpr PropertyConfigProto_DataType_Code TYPE_STRING = + PropertyConfigProto_DataType_Code_STRING; +constexpr PropertyConfigProto_DataType_Code TYPE_DOUBLE = + PropertyConfigProto_DataType_Code_DOUBLE; class SchemaStoreTest : public ::testing::Test { protected: - void SetUp() override { - temp_dir_ = GetTestTempDir() + "/icing"; - schema_store_dir_ = temp_dir_ + "/schema_store"; - filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str()); + SchemaStoreTest() : test_dir_(GetTestTempDir() + "/icing") { + filesystem_.CreateDirectoryRecursively(test_dir_.c_str()); schema_ = SchemaBuilder() @@ -89,117 +79,30 @@ class SchemaStoreTest : public ::testing::Test { } void TearDown() override { - // Check that the schema store directory is the *only* directory in the - // schema_store_dir_. IOW, ensure that all temporary directories have been - // properly cleaned up. - std::vector<std::string> sub_dirs; - ASSERT_TRUE(filesystem_.ListDirectory(temp_dir_.c_str(), &sub_dirs)); - ASSERT_THAT(sub_dirs, ElementsAre("schema_store")); - - // Finally, clean everything up. - ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(temp_dir_.c_str())); + filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()); } - Filesystem filesystem_; - std::string temp_dir_; - std::string schema_store_dir_; + const Filesystem filesystem_; + const std::string test_dir_; SchemaProto schema_; - FakeClock fake_clock_; + const FakeClock fake_clock_; }; TEST_F(SchemaStoreTest, CreationWithNullPointerShouldFail) { EXPECT_THAT( - SchemaStore::Create(/*filesystem=*/nullptr, schema_store_dir_, &fake_clock_), + SchemaStore::Create(/*filesystem=*/nullptr, test_dir_, &fake_clock_), StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); } -TEST_F(SchemaStoreTest, SchemaStoreMoveConstructible) { - // Create an instance of SchemaStore. - SchemaProto schema = - SchemaBuilder() - .AddType(SchemaTypeConfigBuilder().SetType("TypeA").AddProperty( - PropertyConfigBuilder() - .SetName("prop1") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) - .Build(); - - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); - - ICING_ASSERT_OK(schema_store->SetSchema(schema)); - ICING_ASSERT_OK_AND_ASSIGN(Crc32 expected_checksum, - schema_store->ComputeChecksum()); - - // Move construct an instance of SchemaStore - SchemaStore move_constructed_schema_store(std::move(*schema_store)); - EXPECT_THAT(move_constructed_schema_store.GetSchema(), - IsOkAndHolds(Pointee(EqualsProto(schema)))); - EXPECT_THAT(move_constructed_schema_store.ComputeChecksum(), - IsOkAndHolds(Eq(expected_checksum))); - SectionMetadata expected_metadata(/*id_in=*/0, MATCH_EXACT, TOKENIZER_PLAIN, - "prop1"); - EXPECT_THAT(move_constructed_schema_store.GetSectionMetadata("TypeA"), - IsOkAndHolds(Pointee(ElementsAre(expected_metadata)))); -} - -TEST_F(SchemaStoreTest, SchemaStoreMoveAssignment) { - // Create an instance of SchemaStore. 
- SchemaProto schema1 = - SchemaBuilder() - .AddType(SchemaTypeConfigBuilder().SetType("TypeA").AddProperty( - PropertyConfigBuilder() - .SetName("prop1") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) - .Build(); - - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); - - ICING_ASSERT_OK(schema_store->SetSchema(schema1)); - ICING_ASSERT_OK_AND_ASSIGN(Crc32 expected_checksum, - schema_store->ComputeChecksum()); - - // Construct another instance of SchemaStore - SchemaProto schema2 = - SchemaBuilder() - .AddType(SchemaTypeConfigBuilder().SetType("TypeB").AddProperty( - PropertyConfigBuilder() - .SetName("prop2") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) - .Build(); - - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<SchemaStore> move_assigned_schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); - ICING_ASSERT_OK(schema_store->SetSchema(schema2)); - - // Move assign the first instance into the second one. - *move_assigned_schema_store = std::move(*schema_store); - EXPECT_THAT(move_assigned_schema_store->GetSchema(), - IsOkAndHolds(Pointee(EqualsProto(schema1)))); - EXPECT_THAT(move_assigned_schema_store->ComputeChecksum(), - IsOkAndHolds(Eq(expected_checksum))); - SectionMetadata expected_metadata(/*id_in=*/0, MATCH_EXACT, TOKENIZER_PLAIN, - "prop1"); - EXPECT_THAT(move_assigned_schema_store->GetSectionMetadata("TypeA"), - IsOkAndHolds(Pointee(ElementsAre(expected_metadata)))); -} - TEST_F(SchemaStoreTest, CorruptSchemaError) { { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; - result.schema_types_new_by_name.insert(schema_.types(0).schema_type()); EXPECT_THAT(schema_store->SetSchema(schema_), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -215,14 +118,14 @@ TEST_F(SchemaStoreTest, CorruptSchemaError) { .AddType(SchemaTypeConfigBuilder().SetType("corrupted")) .Build(); - const std::string schema_file = absl_ports::StrCat(schema_store_dir_, "/schema.pb"); + const std::string schema_file = absl_ports::StrCat(test_dir_, "/schema.pb"); const std::string serialized_schema = corrupt_schema.SerializeAsString(); filesystem_.Write(schema_file.c_str(), serialized_schema.data(), serialized_schema.size()); // If ground truth was corrupted, we won't know what to do - EXPECT_THAT(SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_), + EXPECT_THAT(SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_), StatusIs(libtextclassifier3::StatusCode::INTERNAL)); } @@ -230,12 +133,11 @@ TEST_F(SchemaStoreTest, RecoverCorruptDerivedFileOk) { { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; - result.schema_types_new_by_name.insert(schema_.types(0).schema_type()); EXPECT_THAT(schema_store->SetSchema(schema_), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -250,12 +152,12 @@ 
TEST_F(SchemaStoreTest, RecoverCorruptDerivedFileOk) { // regenerated from ground truth const std::string schema_type_mapper_dir = - absl_ports::StrCat(schema_store_dir_, "/schema_type_mapper"); + absl_ports::StrCat(test_dir_, "/schema_type_mapper"); filesystem_.DeleteDirectoryRecursively(schema_type_mapper_dir.c_str()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); // Everything looks fine, ground truth and derived data ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -268,12 +170,11 @@ TEST_F(SchemaStoreTest, RecoverBadChecksumOk) { { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; - result.schema_types_new_by_name.insert(schema_.types(0).schema_type()); EXPECT_THAT(schema_store->SetSchema(schema_), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -287,7 +188,7 @@ TEST_F(SchemaStoreTest, RecoverBadChecksumOk) { // the recalculated checksum on initialization. This will force a regeneration // of derived files from ground truth. const std::string header_file = - absl_ports::StrCat(schema_store_dir_, "/schema_store_header"); + absl_ports::StrCat(test_dir_, "/schema_store_header"); SchemaStore::Header header; header.magic = SchemaStore::Header::kMagic; header.checksum = 10; // Arbitrary garbage checksum @@ -296,7 +197,7 @@ TEST_F(SchemaStoreTest, RecoverBadChecksumOk) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); // Everything looks fine, ground truth and derived data ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -308,7 +209,7 @@ TEST_F(SchemaStoreTest, RecoverBadChecksumOk) { TEST_F(SchemaStoreTest, CreateNoPreviousSchemaOk) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); // The apis to retrieve information about the schema should fail gracefully. 
EXPECT_THAT(store->GetSchema(), @@ -341,16 +242,15 @@ TEST_F(SchemaStoreTest, CreateNoPreviousSchemaOk) { TEST_F(SchemaStoreTest, CreateWithPreviousSchemaOk) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); SchemaStore::SetSchemaResult result; result.success = true; - result.schema_types_new_by_name.insert(schema_.types(0).schema_type()); EXPECT_THAT(schema_store->SetSchema(schema_), IsOkAndHolds(EqualsSetSchemaResult(result))); schema_store.reset(); - EXPECT_THAT(SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_), + EXPECT_THAT(SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_), IsOk()); } @@ -363,11 +263,10 @@ TEST_F(SchemaStoreTest, MultipleCreateOk) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); SchemaStore::SetSchemaResult result; result.success = true; - result.schema_types_new_by_name.insert(schema_.types(0).schema_type()); EXPECT_THAT(schema_store->SetSchema(schema_), IsOkAndHolds(EqualsSetSchemaResult(result))); @@ -383,7 +282,7 @@ TEST_F(SchemaStoreTest, MultipleCreateOk) { schema_store.reset(); ICING_ASSERT_OK_AND_ASSIGN( - schema_store, SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + schema_store, SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); // Verify that our in-memory structures are ok EXPECT_THAT(schema_store->GetSchemaTypeConfig("email"), @@ -399,12 +298,11 @@ TEST_F(SchemaStoreTest, MultipleCreateOk) { TEST_F(SchemaStoreTest, SetNewSchemaOk) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; - result.schema_types_new_by_name.insert(schema_.types(0).schema_type()); EXPECT_THAT(schema_store->SetSchema(schema_), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -415,12 +313,11 @@ TEST_F(SchemaStoreTest, SetNewSchemaOk) { TEST_F(SchemaStoreTest, SetSameSchemaOk) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; - result.schema_types_new_by_name.insert(schema_.types(0).schema_type()); EXPECT_THAT(schema_store->SetSchema(schema_), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -428,8 +325,6 @@ TEST_F(SchemaStoreTest, SetSameSchemaOk) { EXPECT_THAT(*actual_schema, EqualsProto(schema_)); // And one more for fun - result = SchemaStore::SetSchemaResult(); - result.success = true; EXPECT_THAT(schema_store->SetSchema(schema_), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema()); @@ -439,12 +334,11 @@ TEST_F(SchemaStoreTest, SetSameSchemaOk) { TEST_F(SchemaStoreTest, SetIncompatibleSchemaOk) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + 
SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; - result.schema_types_new_by_name.insert(schema_.types(0).schema_type()); EXPECT_THAT(schema_store->SetSchema(schema_), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -455,7 +349,6 @@ TEST_F(SchemaStoreTest, SetIncompatibleSchemaOk) { schema_.clear_types(); // Set the incompatible schema - result = SchemaStore::SetSchemaResult(); result.success = false; result.schema_types_deleted_by_name.emplace("email"); result.schema_types_deleted_by_id.emplace(0); @@ -466,7 +359,7 @@ TEST_F(SchemaStoreTest, SetIncompatibleSchemaOk) { TEST_F(SchemaStoreTest, SetSchemaWithAddedTypeOk) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); SchemaProto schema = SchemaBuilder() .AddType(SchemaTypeConfigBuilder().SetType("email")) @@ -475,7 +368,6 @@ TEST_F(SchemaStoreTest, SetSchemaWithAddedTypeOk) { // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; - result.schema_types_new_by_name.insert("email"); EXPECT_THAT(schema_store->SetSchema(schema), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -488,9 +380,6 @@ TEST_F(SchemaStoreTest, SetSchemaWithAddedTypeOk) { .Build(); // Set the compatible schema - result = SchemaStore::SetSchemaResult(); - result.success = true; - result.schema_types_new_by_name.insert("new_type"); EXPECT_THAT(schema_store->SetSchema(schema), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema()); @@ -500,7 +389,7 @@ TEST_F(SchemaStoreTest, SetSchemaWithAddedTypeOk) { TEST_F(SchemaStoreTest, SetSchemaWithDeletedTypeOk) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); SchemaProto schema = SchemaBuilder() @@ -511,8 +400,6 @@ TEST_F(SchemaStoreTest, SetSchemaWithDeletedTypeOk) { // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; - result.schema_types_new_by_name.insert("email"); - result.schema_types_new_by_name.insert("message"); EXPECT_THAT(schema_store->SetSchema(schema), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -558,7 +445,7 @@ TEST_F(SchemaStoreTest, SetSchemaWithDeletedTypeOk) { TEST_F(SchemaStoreTest, SetSchemaWithReorderedTypesOk) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); SchemaProto schema = SchemaBuilder() @@ -569,8 +456,6 @@ TEST_F(SchemaStoreTest, SetSchemaWithReorderedTypesOk) { // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; - result.schema_types_new_by_name.insert("email"); - result.schema_types_new_by_name.insert("message"); EXPECT_THAT(schema_store->SetSchema(schema), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -585,8 +470,6 @@ TEST_F(SchemaStoreTest, SetSchemaWithReorderedTypesOk) { // Since we assign SchemaTypeIds based on order in 
the SchemaProto, this will // cause SchemaTypeIds to change - result = SchemaStore::SetSchemaResult(); - result.success = true; result.old_schema_type_ids_changed.emplace(0); // Old SchemaTypeId of "email" result.old_schema_type_ids_changed.emplace( 1); // Old SchemaTypeId of "message" @@ -598,10 +481,10 @@ TEST_F(SchemaStoreTest, SetSchemaWithReorderedTypesOk) { EXPECT_THAT(*actual_schema, EqualsProto(schema)); } -TEST_F(SchemaStoreTest, IndexedPropertyChangeRequiresReindexingOk) { +TEST_F(SchemaStoreTest, SetSchemaThatRequiresReindexingOk) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); SchemaProto schema = SchemaBuilder() @@ -616,7 +499,6 @@ TEST_F(SchemaStoreTest, IndexedPropertyChangeRequiresReindexingOk) { // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; - result.schema_types_new_by_name.insert("email"); EXPECT_THAT(schema_store->SetSchema(schema), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -632,10 +514,10 @@ TEST_F(SchemaStoreTest, IndexedPropertyChangeRequiresReindexingOk) { .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); + // With a new indexed property, we'll need to reindex + result.index_incompatible = true; + // Set the compatible schema - result = SchemaStore::SetSchemaResult(); - result.success = true; - result.schema_types_index_incompatible_by_name.insert("email"); EXPECT_THAT(schema_store->SetSchema(schema), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema()); @@ -645,7 +527,7 @@ TEST_F(SchemaStoreTest, IndexedPropertyChangeRequiresReindexingOk) { TEST_F(SchemaStoreTest, IndexNestedDocumentsChangeRequiresReindexingOk) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); // Make two schemas. One that sets index_nested_properties to false and one // that sets it to true. @@ -682,8 +564,6 @@ TEST_F(SchemaStoreTest, IndexNestedDocumentsChangeRequiresReindexingOk) { // Set schema with index_nested_properties=false to start. SchemaStore::SetSchemaResult result; result.success = true; - result.schema_types_new_by_name.insert("email"); - result.schema_types_new_by_name.insert("person"); EXPECT_THAT(schema_store->SetSchema(no_nested_index_schema), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -694,7 +574,7 @@ TEST_F(SchemaStoreTest, IndexNestedDocumentsChangeRequiresReindexingOk) { // 'person' is index incompatible. result = SchemaStore::SetSchemaResult(); result.success = true; - result.schema_types_index_incompatible_by_name.insert("person"); + result.index_incompatible = true; EXPECT_THAT(schema_store->SetSchema(nested_index_schema), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema()); @@ -704,7 +584,7 @@ TEST_F(SchemaStoreTest, IndexNestedDocumentsChangeRequiresReindexingOk) { // to 'person' is index incompatible. 
result = SchemaStore::SetSchemaResult(); result.success = true; - result.schema_types_index_incompatible_by_name.insert("person"); + result.index_incompatible = true; EXPECT_THAT(schema_store->SetSchema(no_nested_index_schema), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema()); @@ -714,7 +594,7 @@ TEST_F(SchemaStoreTest, IndexNestedDocumentsChangeRequiresReindexingOk) { TEST_F(SchemaStoreTest, SetSchemaWithIncompatibleTypesOk) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); SchemaProto schema = SchemaBuilder() @@ -729,7 +609,6 @@ TEST_F(SchemaStoreTest, SetSchemaWithIncompatibleTypesOk) { // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; - result.schema_types_new_by_name.insert("email"); EXPECT_THAT(schema_store->SetSchema(schema), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -773,185 +652,10 @@ TEST_F(SchemaStoreTest, SetSchemaWithIncompatibleTypesOk) { EXPECT_THAT(*actual_schema, EqualsProto(schema)); } -TEST_F(SchemaStoreTest, SetSchemaWithIncompatibleNestedTypesOk) { - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); - - // 1. Create a ContactPoint type with a repeated property and set that schema - SchemaTypeConfigBuilder contact_point_repeated_label = - SchemaTypeConfigBuilder() - .SetType("ContactPoint") - .AddProperty(PropertyConfigBuilder() - .SetName("label") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_REPEATED)); - SchemaProto old_schema = - SchemaBuilder().AddType(contact_point_repeated_label).Build(); - ICING_EXPECT_OK(schema_store->SetSchema(old_schema)); - ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId old_contact_point_type_id, - schema_store->GetSchemaTypeId("ContactPoint")); - - // 2. Create a type that references the ContactPoint type and make a backwards - // incompatible change to ContactPoint - SchemaTypeConfigBuilder contact_point_optional_label = - SchemaTypeConfigBuilder() - .SetType("ContactPoint") - .AddProperty(PropertyConfigBuilder() - .SetName("label") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)); - SchemaTypeConfigBuilder person = - SchemaTypeConfigBuilder().SetType("Person").AddProperty( - PropertyConfigBuilder() - .SetName("contactPoints") - .SetDataTypeDocument("ContactPoint", - /*index_nested_properties=*/true) - .SetCardinality(CARDINALITY_REPEATED)); - SchemaProto new_schema = SchemaBuilder() - .AddType(contact_point_optional_label) - .AddType(person) - .Build(); - - // 3. 
SetSchema should fail with ignore_errors_and_delete_documents=false and - // the old schema should remain - SchemaStore::SetSchemaResult expected_result; - expected_result.success = false; - expected_result.schema_types_incompatible_by_name.insert("ContactPoint"); - expected_result.schema_types_incompatible_by_id.insert( - old_contact_point_type_id); - expected_result.schema_types_new_by_name.insert("Person"); - EXPECT_THAT( - schema_store->SetSchema(new_schema, - /*ignore_errors_and_delete_documents=*/false), - IsOkAndHolds(EqualsSetSchemaResult(expected_result))); - ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, - schema_store->GetSchema()); - EXPECT_THAT(*actual_schema, EqualsProto(old_schema)); - - // 4. SetSchema should succeed with ignore_errors_and_delete_documents=true - // and the new schema should be set - expected_result.success = true; - EXPECT_THAT( - schema_store->SetSchema(new_schema, - /*ignore_errors_and_delete_documents=*/true), - IsOkAndHolds(EqualsSetSchemaResult(expected_result))); - ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema()); - EXPECT_THAT(*actual_schema, EqualsProto(new_schema)); -} - -TEST_F(SchemaStoreTest, SetSchemaWithIndexIncompatibleNestedTypesOk) { - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); - - // 1. Create a ContactPoint type with label that matches prefix and set that - // schema - SchemaTypeConfigBuilder contact_point_prefix_label = - SchemaTypeConfigBuilder() - .SetType("ContactPoint") - .AddProperty(PropertyConfigBuilder() - .SetName("label") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_REPEATED)); - SchemaProto old_schema = - SchemaBuilder().AddType(contact_point_prefix_label).Build(); - ICING_EXPECT_OK(schema_store->SetSchema(old_schema)); - - // 2. Create a type that references the ContactPoint type and make a index - // backwards incompatible change to ContactPoint - SchemaTypeConfigBuilder contact_point_exact_label = - SchemaTypeConfigBuilder() - .SetType("ContactPoint") - .AddProperty(PropertyConfigBuilder() - .SetName("label") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_REPEATED)); - SchemaTypeConfigBuilder person = - SchemaTypeConfigBuilder().SetType("Person").AddProperty( - PropertyConfigBuilder() - .SetName("contactPoints") - .SetDataTypeDocument("ContactPoint", - /*index_nested_properties=*/true) - .SetCardinality(CARDINALITY_REPEATED)); - SchemaProto new_schema = SchemaBuilder() - .AddType(contact_point_exact_label) - .AddType(person) - .Build(); - - // SetSchema should succeed, and only ContactPoint should be in - // schema_types_index_incompatible_by_name. - SchemaStore::SetSchemaResult expected_result; - expected_result.success = true; - expected_result.schema_types_index_incompatible_by_name.insert( - "ContactPoint"); - expected_result.schema_types_new_by_name.insert("Person"); - EXPECT_THAT( - schema_store->SetSchema(new_schema, - /*ignore_errors_and_delete_documents=*/false), - IsOkAndHolds(EqualsSetSchemaResult(expected_result))); - ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, - schema_store->GetSchema()); - EXPECT_THAT(*actual_schema, EqualsProto(new_schema)); -} - -TEST_F(SchemaStoreTest, SetSchemaWithCompatibleNestedTypesOk) { - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); - - // 1. 
Create a ContactPoint type with a optional property and set that schema - SchemaTypeConfigBuilder contact_point_optional_label = - SchemaTypeConfigBuilder() - .SetType("ContactPoint") - .AddProperty(PropertyConfigBuilder() - .SetName("label") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)); - SchemaProto old_schema = - SchemaBuilder().AddType(contact_point_optional_label).Build(); - ICING_EXPECT_OK(schema_store->SetSchema(old_schema)); - - // 2. Create a type that references the ContactPoint type and make a backwards - // compatible change to ContactPoint - SchemaTypeConfigBuilder contact_point_repeated_label = - SchemaTypeConfigBuilder() - .SetType("ContactPoint") - .AddProperty(PropertyConfigBuilder() - .SetName("label") - .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_REPEATED)); - SchemaTypeConfigBuilder person = - SchemaTypeConfigBuilder().SetType("Person").AddProperty( - PropertyConfigBuilder() - .SetName("contactPoints") - .SetDataTypeDocument("ContactPoint", - /*index_nested_properties=*/true) - .SetCardinality(CARDINALITY_REPEATED)); - SchemaProto new_schema = SchemaBuilder() - .AddType(contact_point_repeated_label) - .AddType(person) - .Build(); - - // 3. SetSchema should succeed, and only ContactPoint should be in - // schema_types_changed_fully_compatible_by_name. - SchemaStore::SetSchemaResult expected_result; - expected_result.success = true; - expected_result.schema_types_changed_fully_compatible_by_name.insert( - "ContactPoint"); - expected_result.schema_types_new_by_name.insert("Person"); - EXPECT_THAT(schema_store->SetSchema( - new_schema, /*ignore_errors_and_delete_documents=*/false), - IsOkAndHolds(EqualsSetSchemaResult(expected_result))); - ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, - schema_store->GetSchema()); - EXPECT_THAT(*actual_schema, EqualsProto(new_schema)); -} - TEST_F(SchemaStoreTest, GetSchemaTypeId) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); schema_.clear_types(); @@ -967,8 +671,6 @@ TEST_F(SchemaStoreTest, GetSchemaTypeId) { // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; - result.schema_types_new_by_name.insert(first_type); - result.schema_types_new_by_name.insert(second_type); EXPECT_THAT(schema_store->SetSchema(schema_), IsOkAndHolds(EqualsSetSchemaResult(result))); @@ -979,7 +681,7 @@ TEST_F(SchemaStoreTest, GetSchemaTypeId) { TEST_F(SchemaStoreTest, ComputeChecksumDefaultOnEmptySchemaStore) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); Crc32 default_checksum; EXPECT_THAT(schema_store->ComputeChecksum(), IsOkAndHolds(default_checksum)); @@ -988,7 +690,7 @@ TEST_F(SchemaStoreTest, ComputeChecksumDefaultOnEmptySchemaStore) { TEST_F(SchemaStoreTest, ComputeChecksumSameBetweenCalls) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); SchemaProto foo_schema = SchemaBuilder().AddType(SchemaTypeConfigBuilder().SetType("foo")).Build(); @@ -1004,7 +706,7 @@ TEST_F(SchemaStoreTest, ComputeChecksumSameBetweenCalls) { TEST_F(SchemaStoreTest, 
ComputeChecksumSameAcrossInstances) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); SchemaProto foo_schema = SchemaBuilder().AddType(SchemaTypeConfigBuilder().SetType("foo")).Build(); @@ -1017,14 +719,14 @@ TEST_F(SchemaStoreTest, ComputeChecksumSameAcrossInstances) { schema_store.reset(); ICING_ASSERT_OK_AND_ASSIGN( - schema_store, SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + schema_store, SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); EXPECT_THAT(schema_store->ComputeChecksum(), IsOkAndHolds(checksum)); } TEST_F(SchemaStoreTest, ComputeChecksumChangesOnModification) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); SchemaProto foo_schema = SchemaBuilder().AddType(SchemaTypeConfigBuilder().SetType("foo")).Build(); @@ -1048,7 +750,7 @@ TEST_F(SchemaStoreTest, ComputeChecksumChangesOnModification) { TEST_F(SchemaStoreTest, PersistToDiskFineForEmptySchemaStore) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); // Persisting is fine and shouldn't affect anything ICING_EXPECT_OK(schema_store->PersistToDisk()); @@ -1057,7 +759,7 @@ TEST_F(SchemaStoreTest, PersistToDiskFineForEmptySchemaStore) { TEST_F(SchemaStoreTest, PersistToDiskPreservesAcrossInstances) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); SchemaProto schema = SchemaBuilder().AddType(SchemaTypeConfigBuilder().SetType("foo")).Build(); @@ -1082,7 +784,7 @@ TEST_F(SchemaStoreTest, PersistToDiskPreservesAcrossInstances) { // And we get the same schema back on reinitialization ICING_ASSERT_OK_AND_ASSIGN( - schema_store, SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + schema_store, SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema()); EXPECT_THAT(*actual_schema, EqualsProto(schema)); } @@ -1090,7 +792,7 @@ TEST_F(SchemaStoreTest, PersistToDiskPreservesAcrossInstances) { TEST_F(SchemaStoreTest, SchemaStoreStorageInfoProto) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); // Create a schema with two types: one simple type and one type that uses all // 16 sections. 
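
With the per-type compatibility sets gone from SetSchemaResult, the result only reports whether the index survived at all, so callers can no longer reindex selectively. A small illustration of how a caller might act on the simplified result; the struct mirrors only the fields visible in this change, and the enum and function are hypothetical:

#include <string>
#include <unordered_set>

// Subset of SchemaStore::SetSchemaResult as it stands after this change;
// the real struct lives in icing/schema/schema-store.h.
struct SetSchemaResult {
  bool success = false;
  bool index_incompatible = false;
  std::unordered_set<std::string> schema_types_deleted_by_name;
};

enum class IndexAction { kNone, kRebuildAll };

// Without per-type breakdowns, any index-incompatible change maps to a
// full index rebuild; a rejected schema requires no index work at all.
IndexAction PlanIndexWork(const SetSchemaResult& result) {
  if (!result.success) return IndexAction::kNone;
  return result.index_incompatible ? IndexAction::kRebuildAll
                                   : IndexAction::kNone;
}
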
@@ -1127,8 +829,6 @@ TEST_F(SchemaStoreTest, SchemaStoreStorageInfoProto) { SchemaStore::SetSchemaResult result; result.success = true; - result.schema_types_new_by_name.insert("email"); - result.schema_types_new_by_name.insert("fullSectionsType"); EXPECT_THAT(schema_store->SetSchema(schema), IsOkAndHolds(EqualsSetSchemaResult(result))); @@ -1139,114 +839,6 @@ TEST_F(SchemaStoreTest, SchemaStoreStorageInfoProto) { EXPECT_THAT(storage_info.num_schema_types_sections_exhausted(), Eq(1)); } -TEST_F(SchemaStoreTest, GetDebugInfo) { - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); - - // Set schema - ASSERT_THAT( - schema_store->SetSchema(schema_), - IsOkAndHolds(EqualsSetSchemaResult(SchemaStore::SetSchemaResult{ - .success = true, - .schema_types_new_by_name = {schema_.types(0).schema_type()}}))); - - // Check debug info - ICING_ASSERT_OK_AND_ASSIGN(SchemaDebugInfoProto out, - schema_store->GetDebugInfo()); - EXPECT_THAT(out.schema(), EqualsProto(schema_)); - EXPECT_THAT(out.crc(), Gt(0)); -} - -TEST_F(SchemaStoreTest, GetDebugInfoForEmptySchemaStore) { - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); - - // Check debug info before setting a schema - ICING_ASSERT_OK_AND_ASSIGN(SchemaDebugInfoProto out, - schema_store->GetDebugInfo()); - SchemaDebugInfoProto expected_out; - expected_out.set_crc(0); - EXPECT_THAT(out, EqualsProto(expected_out)); -} - -TEST_F(SchemaStoreTest, InitializeRegenerateDerivedFilesFailure) { - // This test covers the first point that RegenerateDerivedFiles could fail. - // This should simply result in SetSchema::Create returning an INTERNAL error. - - { - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); - SchemaProto schema = SchemaBuilder() - .AddType(SchemaTypeConfigBuilder().SetType("Type")) - .Build(); - ICING_ASSERT_OK(schema_store->SetSchema(std::move(schema))); - } - - auto mock_filesystem = std::make_unique<MockFilesystem>(); - ON_CALL(*mock_filesystem, - CreateDirectoryRecursively(HasSubstr("key_mapper_dir"))) - .WillByDefault(Return(false)); - { - EXPECT_THAT(SchemaStore::Create(mock_filesystem.get(), schema_store_dir_, - &fake_clock_), - StatusIs(libtextclassifier3::StatusCode::INTERNAL)); - } -} - -TEST_F(SchemaStoreTest, SetSchemaRegenerateDerivedFilesFailure) { - // This test covers the second point that RegenerateDerivedFiles could fail. - // If handled correctly, the schema store and section manager should still be - // in the original, valid state. 
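
The tests being removed here exercise that guarantee by injecting an I/O failure through a mocked filesystem and asserting the store surfaces INTERNAL instead of corrupting state. A self-contained version of the injection pattern, with a toy interface standing in for icing's Filesystem and GoogleMock forcing the failure:

#include <string>
#include "gmock/gmock.h"
#include "gtest/gtest.h"

// Just enough surface to demonstrate ON_CALL failure injection.
class FilesystemIface {
 public:
  virtual ~FilesystemIface() = default;
  virtual bool CreateDirectoryRecursively(const std::string& path) = 0;
};

class MockFs : public FilesystemIface {
 public:
  MOCK_METHOD(bool, CreateDirectoryRecursively, (const std::string& path),
              (override));
};

// Hypothetical code under test: it must report the failure, not crash.
bool InitDerivedFiles(FilesystemIface& fs) {
  return fs.CreateDirectoryRecursively("/store/key_mapper_dir");
}

TEST(FailureInjectionSketch, SurfacesIoError) {
  ::testing::NiceMock<MockFs> fs;
  ON_CALL(fs, CreateDirectoryRecursively(::testing::HasSubstr("key_mapper_dir")))
      .WillByDefault(::testing::Return(false));
  EXPECT_FALSE(InitDerivedFiles(fs));
}
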
- SchemaTypeConfigProto type = - SchemaTypeConfigBuilder() - .SetType("Type") - .AddProperty(PropertyConfigBuilder() - .SetName("prop1") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) - .Build(); - { - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); - SchemaProto schema = SchemaBuilder().AddType(type).Build(); - ICING_ASSERT_OK(schema_store->SetSchema(std::move(schema))); - } - - { - auto mock_filesystem = std::make_unique<MockFilesystem>(); - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(mock_filesystem.get(), schema_store_dir_, - &fake_clock_)); - - ON_CALL(*mock_filesystem, - CreateDirectoryRecursively(HasSubstr("key_mapper_dir"))) - .WillByDefault(Return(false)); - SchemaProto schema = - SchemaBuilder() - .AddType(type) - .AddType(SchemaTypeConfigBuilder().SetType("Type2")) - .Build(); - EXPECT_THAT(schema_store->SetSchema(std::move(schema)), - StatusIs(libtextclassifier3::StatusCode::INTERNAL)); - DocumentProto document = DocumentBuilder() - .SetSchema("Type") - .AddStringProperty("prop1", "foo bar baz") - .Build(); - SectionMetadata expected_metadata(/*id_in=*/0, MATCH_EXACT, TOKENIZER_PLAIN, - "prop1"); - ICING_ASSERT_OK_AND_ASSIGN(std::vector<Section> sections, - schema_store->ExtractSections(document)); - ASSERT_THAT(sections, SizeIs(1)); - EXPECT_THAT(sections.at(0).metadata, Eq(expected_metadata)); - EXPECT_THAT(sections.at(0).content, ElementsAre("foo bar baz")); - } -} - } // namespace } // namespace lib diff --git a/icing/schema/schema-util.cc b/icing/schema/schema-util.cc index 88b6946..cabe76d 100644 --- a/icing/schema/schema-util.cc +++ b/icing/schema/schema-util.cc @@ -37,20 +37,6 @@ namespace lib { namespace { -bool ArePropertiesEqual(const PropertyConfigProto& old_property, - const PropertyConfigProto& new_property) { - return old_property.property_name() == new_property.property_name() && - old_property.data_type() == new_property.data_type() && - old_property.schema_type() == new_property.schema_type() && - old_property.cardinality() == new_property.cardinality() && - old_property.string_indexing_config().term_match_type() == - new_property.string_indexing_config().term_match_type() && - old_property.string_indexing_config().tokenizer_type() == - new_property.string_indexing_config().tokenizer_type() && - old_property.document_indexing_config().index_nested_properties() == - new_property.document_indexing_config().index_nested_properties(); -} - bool IsCardinalityCompatible(const PropertyConfigProto& old_property, const PropertyConfigProto& new_property) { if (old_property.cardinality() < new_property.cardinality()) { @@ -107,33 +93,6 @@ bool IsTermMatchTypeCompatible(const StringIndexingConfig& old_indexed, old_indexed.tokenizer_type() == new_indexed.tokenizer_type(); } -void AddIncompatibleChangeToDelta( - std::unordered_set<std::string>& incompatible_delta, - const SchemaTypeConfigProto& old_type_config, - const SchemaUtil::DependencyMap& new_schema_dependency_map, - const SchemaUtil::TypeConfigMap& old_type_config_map, - const SchemaUtil::TypeConfigMap& new_type_config_map) { - // If this type is incompatible, then every type that depends on it might - // also be incompatible. Use the dependency map to mark those ones as - // incompatible too. 
- incompatible_delta.insert(old_type_config.schema_type()); - auto parent_types_itr = - new_schema_dependency_map.find(old_type_config.schema_type()); - if (parent_types_itr != new_schema_dependency_map.end()) { - for (std::string_view parent_type : parent_types_itr->second) { - // The types from new_schema that depend on the current - // old_type_config may not present in old_schema. - // Those types will be listed at schema_delta.schema_types_new - // instead. - std::string parent_type_str(parent_type); - if (old_type_config_map.find(parent_type_str) != - old_type_config_map.end()) { - incompatible_delta.insert(std::move(parent_type_str)); - } - } - } -} - } // namespace libtextclassifier3::Status ExpandTranstiveDependencies( @@ -473,9 +432,9 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta( const SchemaProto& old_schema, const SchemaProto& new_schema, const DependencyMap& new_schema_dependency_map) { SchemaDelta schema_delta; + schema_delta.index_incompatible = false; - TypeConfigMap old_type_config_map, new_type_config_map; - BuildTypeConfigMap(old_schema, &old_type_config_map); + TypeConfigMap new_type_config_map; BuildTypeConfigMap(new_schema, &new_type_config_map); // Iterate through and check each field of the old schema @@ -504,9 +463,6 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta( // If there is a different number of properties, then there must have been a // change. - bool has_property_changed = - old_type_config.properties_size() != - new_schema_type_and_config->second.properties_size(); bool is_incompatible = false; bool is_index_incompatible = false; for (const auto& old_property_config : old_type_config.properties()) { @@ -542,11 +498,6 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta( const PropertyConfigProto* new_property_config = new_property_name_and_config->second; - if (!has_property_changed && - !ArePropertiesEqual(old_property_config, *new_property_config)) { - // Finally found a property that changed. - has_property_changed = true; - } if (!IsPropertyCompatible(old_property_config, *new_property_config)) { ICING_VLOG(1) << absl_ports::StrCat( @@ -594,33 +545,26 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta( } if (is_incompatible) { - AddIncompatibleChangeToDelta(schema_delta.schema_types_incompatible, - old_type_config, new_schema_dependency_map, - old_type_config_map, new_type_config_map); + // If this type is incompatible, then every type that depends on it might + // also be incompatible. Use the dependency map to mark those ones as + // incompatible too. 
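
Standalone, the inline propagation that replaces AddIncompatibleChangeToDelta works like this: the dependency map (already expanded transitively by ExpandTranstiveDependencies) maps a type to every type that embeds it, so one lookup marks all affected parents. A std-only sketch with illustrative names:

#include <string>
#include <unordered_map>
#include <unordered_set>

// Stand-in for SchemaUtil::DependencyMap: map[type] holds every type that
// transitively depends on `type` via nested documents.
using DependencyMap =
    std::unordered_map<std::string, std::unordered_set<std::string>>;

// Mark `type` incompatible along with everything that embeds it; a single
// lookup suffices because the map is already transitive.
void MarkIncompatible(const std::string& type, const DependencyMap& dependents,
                      std::unordered_set<std::string>& incompatible) {
  incompatible.insert(type);
  auto it = dependents.find(type);
  if (it != dependents.end()) {
    incompatible.insert(it->second.begin(), it->second.end());
  }
}
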
+ schema_delta.schema_types_incompatible.insert( + old_type_config.schema_type()); + auto parent_types_itr = + new_schema_dependency_map.find(old_type_config.schema_type()); + if (parent_types_itr != new_schema_dependency_map.end()) { + schema_delta.schema_types_incompatible.reserve( + schema_delta.schema_types_incompatible.size() + + parent_types_itr->second.size()); + schema_delta.schema_types_incompatible.insert( + parent_types_itr->second.begin(), parent_types_itr->second.end()); + } } if (is_index_incompatible) { - AddIncompatibleChangeToDelta(schema_delta.schema_types_index_incompatible, - old_type_config, new_schema_dependency_map, - old_type_config_map, new_type_config_map); + schema_delta.index_incompatible = true; } - if (!is_incompatible && !is_index_incompatible && has_property_changed) { - schema_delta.schema_types_changed_fully_compatible.insert( - old_type_config.schema_type()); - } - - // Lastly, remove this type from the map. We know that this type can't - // come up in future iterations through the old schema types because the old - // type config has unique types. - new_type_config_map.erase(old_type_config.schema_type()); - } - - // Any types that are still present in the new_type_config_map are newly added - // types. - schema_delta.schema_types_new.reserve(new_type_config_map.size()); - for (auto& kvp : new_type_config_map) { - schema_delta.schema_types_new.insert(std::move(kvp.first)); } return schema_delta; diff --git a/icing/schema/schema-util.h b/icing/schema/schema-util.h index fa80b15..abbc55d 100644 --- a/icing/schema/schema-util.h +++ b/icing/schema/schema-util.h @@ -41,6 +41,12 @@ class SchemaUtil { std::unordered_set<std::string_view>>; struct SchemaDelta { + // Whether an indexing config has changed, requiring the index to be + // regenerated. We don't list out all the types that make the index + // incompatible because our index isn't optimized for that. It's much easier + // to reset the entire index and reindex every document. + bool index_incompatible = false; + // Which schema types were present in the old schema, but were deleted from // the new schema. std::unordered_set<std::string> schema_types_deleted; @@ -49,28 +55,10 @@ class SchemaUtil { // could invalidate existing Documents of that schema type. std::unordered_set<std::string> schema_types_incompatible; - // Schema types that were added in the new schema. Represented by the - // `schema_type` field in the SchemaTypeConfigProto. - std::unordered_set<std::string> schema_types_new; - - // Schema types that were changed in a way that was backwards compatible and - // didn't invalidate the index. Represented by the `schema_type` field in - // the SchemaTypeConfigProto. - std::unordered_set<std::string> schema_types_changed_fully_compatible; - - // Schema types that were changed in a way that was backwards compatible, - // but invalidated the index. Represented by the `schema_type` field in the - // SchemaTypeConfigProto. 
- std::unordered_set<std::string> schema_types_index_incompatible; - bool operator==(const SchemaDelta& other) const { - return schema_types_deleted == other.schema_types_deleted && - schema_types_incompatible == other.schema_types_incompatible && - schema_types_new == other.schema_types_new && - schema_types_changed_fully_compatible == - other.schema_types_changed_fully_compatible && - schema_types_index_incompatible == - other.schema_types_index_incompatible; + return index_incompatible == other.index_incompatible && + schema_types_deleted == other.schema_types_deleted && + schema_types_incompatible == other.schema_types_incompatible; } }; diff --git a/icing/schema/schema-util_test.cc b/icing/schema/schema-util_test.cc index f28a2f8..049dd79 100644 --- a/icing/schema/schema-util_test.cc +++ b/icing/schema/schema-util_test.cc @@ -38,32 +38,32 @@ constexpr char kEmailType[] = "EmailMessage"; constexpr char kMessageType[] = "Text"; constexpr char kPersonType[] = "Person"; -constexpr PropertyConfigProto::DataType::Code TYPE_DOCUMENT = - PropertyConfigProto::DataType::DOCUMENT; -constexpr PropertyConfigProto::DataType::Code TYPE_STRING = - PropertyConfigProto::DataType::STRING; -constexpr PropertyConfigProto::DataType::Code TYPE_INT = - PropertyConfigProto::DataType::INT64; -constexpr PropertyConfigProto::DataType::Code TYPE_DOUBLE = - PropertyConfigProto::DataType::DOUBLE; - -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_UNKNOWN = - PropertyConfigProto::Cardinality::UNKNOWN; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED = - PropertyConfigProto::Cardinality::REQUIRED; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = - PropertyConfigProto::Cardinality::OPTIONAL; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED = - PropertyConfigProto::Cardinality::REPEATED; - -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_NONE = - StringIndexingConfig::TokenizerType::NONE; -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = - StringIndexingConfig::TokenizerType::PLAIN; - -constexpr TermMatchType::Code MATCH_UNKNOWN = TermMatchType::UNKNOWN; -constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; -constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX; +constexpr PropertyConfigProto_DataType_Code TYPE_DOCUMENT = + PropertyConfigProto_DataType_Code_DOCUMENT; +constexpr PropertyConfigProto_DataType_Code TYPE_STRING = + PropertyConfigProto_DataType_Code_STRING; +constexpr PropertyConfigProto_DataType_Code TYPE_INT = + PropertyConfigProto_DataType_Code_INT64; +constexpr PropertyConfigProto_DataType_Code TYPE_DOUBLE = + PropertyConfigProto_DataType_Code_DOUBLE; + +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_UNKNOWN = + PropertyConfigProto_Cardinality_Code_UNKNOWN; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REQUIRED = + PropertyConfigProto_Cardinality_Code_REQUIRED; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = + PropertyConfigProto_Cardinality_Code_OPTIONAL; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED = + PropertyConfigProto_Cardinality_Code_REPEATED; + +constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_NONE = + StringIndexingConfig_TokenizerType_Code_NONE; +constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = + StringIndexingConfig_TokenizerType_Code_PLAIN; + +constexpr TermMatchType_Code MATCH_UNKNOWN = TermMatchType_Code_UNKNOWN; +constexpr TermMatchType_Code 
MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; +constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX; TEST(SchemaUtilTest, DependencyGraphAlphabeticalOrder) { // Create a schema with the following dependencies: @@ -705,7 +705,6 @@ TEST(SchemaUtilTest, NewOptionalPropertyIsCompatible) { .Build(); SchemaUtil::SchemaDelta schema_delta; - schema_delta.schema_types_changed_fully_compatible.insert(kEmailType); SchemaUtil::DependencyMap no_dependencies_map; EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta( old_schema, new_schema_with_optional, no_dependencies_map), @@ -818,8 +817,6 @@ TEST(SchemaUtilTest, CompatibilityOfDifferentCardinalityOk) { // We can have the new schema be less restrictive, OPTIONAL->REPEATED; SchemaUtil::SchemaDelta compatible_schema_delta; - compatible_schema_delta.schema_types_changed_fully_compatible.insert( - kEmailType); EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta( /*old_schema=*/more_restrictive_schema, /*new_schema=*/less_restrictive_schema, no_dependencies_map), @@ -915,6 +912,7 @@ TEST(SchemaUtilTest, DifferentSchemaTypeIsIncompatible) { SchemaUtil::SchemaDelta actual = SchemaUtil::ComputeCompatibilityDelta( old_schema, new_schema, dependencies_map); EXPECT_THAT(actual, Eq(schema_delta)); + EXPECT_THAT(actual.index_incompatible, testing::IsFalse()); EXPECT_THAT(actual.schema_types_incompatible, testing::ElementsAre(kEmailType)); EXPECT_THAT(actual.schema_types_deleted, testing::IsEmpty()); @@ -946,7 +944,7 @@ TEST(SchemaUtilTest, ChangingIndexedPropertiesMakesIndexIncompatible) { .Build(); SchemaUtil::SchemaDelta schema_delta; - schema_delta.schema_types_index_incompatible.insert(kPersonType); + schema_delta.index_incompatible = true; // New schema gained a new indexed property. SchemaUtil::DependencyMap no_dependencies_map; @@ -993,7 +991,7 @@ TEST(SchemaUtilTest, AddingNewIndexedPropertyMakesIndexIncompatible) { .Build(); SchemaUtil::SchemaDelta schema_delta; - schema_delta.schema_types_index_incompatible.insert(kPersonType); + schema_delta.index_incompatible = true; SchemaUtil::DependencyMap no_dependencies_map; EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema, no_dependencies_map), @@ -1033,7 +1031,6 @@ TEST(SchemaUtilTest, AddingTypeIsCompatible) { .Build(); SchemaUtil::SchemaDelta schema_delta; - schema_delta.schema_types_new.insert(kEmailType); SchemaUtil::DependencyMap no_dependencies_map; EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema, no_dependencies_map), @@ -1112,7 +1109,7 @@ TEST(SchemaUtilTest, DeletingPropertyAndChangingProperty) { SchemaUtil::SchemaDelta schema_delta; schema_delta.schema_types_incompatible.emplace(kEmailType); - schema_delta.schema_types_index_incompatible.emplace(kEmailType); + schema_delta.index_incompatible = true; SchemaUtil::DependencyMap no_dependencies_map; SchemaUtil::SchemaDelta actual = SchemaUtil::ComputeCompatibilityDelta( old_schema, new_schema, no_dependencies_map); @@ -1160,7 +1157,7 @@ TEST(SchemaUtilTest, IndexNestedDocumentsIndexIncompatible) { // should make kPersonType index_incompatible. kEmailType should be // unaffected. 
SchemaUtil::SchemaDelta schema_delta; - schema_delta.schema_types_index_incompatible.emplace(kPersonType); + schema_delta.index_incompatible = true; SchemaUtil::DependencyMap dependencies_map = {{kEmailType, {kPersonType}}}; SchemaUtil::SchemaDelta actual = SchemaUtil::ComputeCompatibilityDelta( no_nested_index_schema, nested_index_schema, dependencies_map); diff --git a/icing/schema/section.h b/icing/schema/section.h index 8b2ba55..40e623a 100644 --- a/icing/schema/section.h +++ b/icing/schema/section.h @@ -77,11 +77,6 @@ struct SectionMetadata { id(id_in), tokenizer(tokenizer), term_match_type(term_match_type_in) {} - - bool operator==(const SectionMetadata& rhs) const { - return path == rhs.path && id == rhs.id && tokenizer == rhs.tokenizer && - term_match_type == rhs.term_match_type; - } }; // Section is an icing internal concept similar to document property but with diff --git a/icing/scoring/bm25f-calculator.cc b/icing/scoring/bm25f-calculator.cc index 28d385e..4822d7f 100644 --- a/icing/scoring/bm25f-calculator.cc +++ b/icing/scoring/bm25f-calculator.cc @@ -26,7 +26,6 @@ #include "icing/store/corpus-associated-scoring-data.h" #include "icing/store/corpus-id.h" #include "icing/store/document-associated-score-data.h" -#include "icing/store/document-filter-data.h" #include "icing/store/document-id.h" namespace icing { @@ -43,11 +42,8 @@ constexpr float k1_ = 1.2f; constexpr float b_ = 0.7f; // TODO(b/158603900): add tests for Bm25fCalculator -Bm25fCalculator::Bm25fCalculator( - const DocumentStore* document_store, - std::unique_ptr<SectionWeights> section_weights) - : document_store_(document_store), - section_weights_(std::move(section_weights)) {} +Bm25fCalculator::Bm25fCalculator(const DocumentStore* document_store) + : document_store_(document_store) {} // During initialization, Bm25fCalculator iterates through // hit-iterators for each query term to pre-compute n(q_i) for each corpus under @@ -125,9 +121,9 @@ float Bm25fCalculator::ComputeScore(const DocHitInfoIterator* query_it, // Compute inverse document frequency (IDF) weight for query term in the given // corpus, and cache it in the map. // -// N - n(q_i) + 0.5 -// IDF(q_i) = ln(1 + ------------------) -// n(q_i) + 0.5 +// N - n(q_i) + 0.5 +// IDF(q_i) = log(1 + ------------------) +// n(q_i) + 0.5 // // where N is the number of documents in the corpus, and n(q_i) is the number // of documents in the corpus containing the query term q_i. @@ -153,7 +149,7 @@ float Bm25fCalculator::GetCorpusIdfWeightForTerm(std::string_view term, uint32_t num_docs = csdata.num_docs(); uint32_t nqi = corpus_nqi_map_[corpus_term_info.value]; float idf = - nqi != 0 ? log(1.0f + (num_docs - nqi + 0.5f) / (nqi + 0.5f)) : 0.0f; + nqi != 0 ? log(1.0f + (num_docs - nqi + 0.5f) / (nqi - 0.5f)) : 0.0f; corpus_idf_map_.insert({corpus_term_info.value, idf}); ICING_VLOG(1) << IcingStringUtil::StringPrintf( "corpus_id:%d term:%s N:%d nqi:%d idf:%f", corpus_id, @@ -162,11 +158,6 @@ float Bm25fCalculator::GetCorpusIdfWeightForTerm(std::string_view term, } // Get per corpus average document length and cache the result in the map. 
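// A worked example of the IDF formula above, under an assumed corpus of
// N = 100 documents of which n(q_i) = 10 contain the term (the numbers are
// hypothetical; log here is the natural log, as in the code):
//
//   float num_docs = 100.0f, nqi = 10.0f;
//   float idf = log(1.0f + (num_docs - nqi + 0.5f) / (nqi + 0.5f));
//   // (100 - 10 + 0.5) / (10 + 0.5) = 8.619, so idf = log(9.619) ~= 2.26
//
// Rarer terms push n(q_i) down and the IDF up, while a term present in nearly
// every document drives the ratio, and hence its weight, toward zero. The
// sketch follows the commented formula, which adds 0.5 in the denominator.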
-// The average doc length is calculated as: -// -// total_tokens_in_corpus -// Avg Doc Length = ------------------------- -// num_docs_in_corpus + 1 float Bm25fCalculator::GetCorpusAvgDocLength(CorpusId corpus_id) { auto iter = corpus_avgdl_map_.find(corpus_id); if (iter != corpus_avgdl_map_.end()) { @@ -200,8 +191,8 @@ float Bm25fCalculator::ComputedNormalizedTermFrequency( const DocumentAssociatedScoreData& data) { uint32_t dl = data.length_in_tokens(); float avgdl = GetCorpusAvgDocLength(data.corpus_id()); - float f_q = ComputeTermFrequencyForMatchedSections( - data.corpus_id(), term_match_info, hit_info.document_id()); + float f_q = + ComputeTermFrequencyForMatchedSections(data.corpus_id(), term_match_info); float normalized_tf = f_q * (k1_ + 1) / (f_q + k1_ * (1 - b_ + b_ * dl / avgdl)); @@ -211,41 +202,23 @@ float Bm25fCalculator::ComputedNormalizedTermFrequency( return normalized_tf; } +// Note: once we support section weights, we should update this function to +// compute the weighted term frequency. float Bm25fCalculator::ComputeTermFrequencyForMatchedSections( - CorpusId corpus_id, const TermMatchInfo& term_match_info, - DocumentId document_id) const { + CorpusId corpus_id, const TermMatchInfo& term_match_info) const { float sum = 0.0f; SectionIdMask sections = term_match_info.section_ids_mask; - SchemaTypeId schema_type_id = GetSchemaTypeId(document_id); - while (sections != 0) { SectionId section_id = __builtin_ctz(sections); sections &= ~(1u << section_id); Hit::TermFrequency tf = term_match_info.term_frequencies[section_id]; - double weighted_tf = tf * section_weights_->GetNormalizedSectionWeight( - schema_type_id, section_id); if (tf != Hit::kNoTermFrequency) { - sum += weighted_tf; + sum += tf; } } return sum; } -SchemaTypeId Bm25fCalculator::GetSchemaTypeId(DocumentId document_id) const { - auto filter_data_or = document_store_->GetDocumentFilterData(document_id); - if (!filter_data_or.ok()) { - // This should never happen. The only failure case for - // GetDocumentFilterData is if the document_id is outside of the range of - // allocated document_ids, which shouldn't be possible since we're getting - // this document_id from the posting lists. - ICING_LOG(WARNING) << IcingStringUtil::StringPrintf( - "No document filter data for document [%d]", document_id); - return kInvalidSchemaTypeId; - } - DocumentFilterData data = filter_data_or.ValueOrDie(); - return data.schema_type_id(); -} - } // namespace lib } // namespace icing diff --git a/icing/scoring/bm25f-calculator.h b/icing/scoring/bm25f-calculator.h index 05009d8..91b4f24 100644 --- a/icing/scoring/bm25f-calculator.h +++ b/icing/scoring/bm25f-calculator.h @@ -22,7 +22,6 @@ #include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/legacy/index/icing-bit-util.h" -#include "icing/scoring/section-weights.h" #include "icing/store/corpus-id.h" #include "icing/store/document-store.h" @@ -63,8 +62,7 @@ namespace lib { // see: glossary/bm25 class Bm25fCalculator { public: - explicit Bm25fCalculator(const DocumentStore *document_store_, - std::unique_ptr<SectionWeights> section_weights_); + explicit Bm25fCalculator(const DocumentStore *document_store_); // Precompute and cache statistics relevant to BM25F. // Populates term_id_map_ and corpus_nqi_map_ for use while scoring other @@ -110,43 +108,18 @@ class Bm25fCalculator { } }; - // Returns idf weight for the term and provided corpus. 
float GetCorpusIdfWeightForTerm(std::string_view term, CorpusId corpus_id); - - // Returns the average document length for the corpus. The average is - // calculated as the sum of tokens in the corpus' documents over the total - // number of documents plus one. float GetCorpusAvgDocLength(CorpusId corpus_id); - - // Returns the normalized term frequency for the term match and document hit. - // This normalizes the term frequency by applying smoothing parameters and - // factoring document length. float ComputedNormalizedTermFrequency( const TermMatchInfo &term_match_info, const DocHitInfo &hit_info, const DocumentAssociatedScoreData &data); - - // Returns the weighted term frequency for the term match and document. For - // each section the term is present, we scale the term frequency by its - // section weight. We return the sum of the weighted term frequencies over all - // sections. float ComputeTermFrequencyForMatchedSections( - CorpusId corpus_id, const TermMatchInfo &term_match_info, - DocumentId document_id) const; + CorpusId corpus_id, const TermMatchInfo &term_match_info) const; - // Returns the schema type id for the document by retrieving it from the - // DocumentFilterData. - SchemaTypeId GetSchemaTypeId(DocumentId document_id) const; - - // Clears cached scoring data and prepares the calculator for a new scoring - // run. void Clear(); const DocumentStore *document_store_; // Does not own. - // Used for accessing normalized section weights when computing the weighted - // term frequency. - std::unique_ptr<SectionWeights> section_weights_; - // Map from query term to compact term ID. // Necessary as a key to the other maps. // The use of the string_view as key here means that the query_term_iterators @@ -157,6 +130,7 @@ class Bm25fCalculator { // Necessary to calculate the normalized term frequency. // This information is cached in the DocumentStore::CorpusScoreCache std::unordered_map<CorpusId, float> corpus_avgdl_map_; + // Map from <corpus ID, term ID> to number of documents containing term q_i, // called n(q_i). // Necessary to calculate IDF(q_i) (inverse document frequency). diff --git a/icing/scoring/ranker.cc b/icing/scoring/ranker.cc index 117f44c..fecee82 100644 --- a/icing/scoring/ranker.cc +++ b/icing/scoring/ranker.cc @@ -32,7 +32,6 @@ namespace { // Helper function to wrap the heapify algorithm, it heapifies the target // subtree node in place. -// TODO(b/152934343) refactor the heapify function and making it into a class. void Heapify( std::vector<ScoredDocumentHit>* scored_document_hits, int target_subtree_root_index, @@ -72,80 +71,6 @@ void Heapify( } } -// Heapify the given term vector from top to bottom. Call it after add or -// replace an element at the front of the vector. -void HeapifyTermDown(std::vector<TermMetadata>& scored_terms, - int target_subtree_root_index) { - int heap_size = scored_terms.size(); - if (target_subtree_root_index >= heap_size) { - return; - } - - // Initializes subtree root as the current minimum node. - int min = target_subtree_root_index; - // If we represent a heap in an array/vector, indices of left and right - // children can be calculated as such. - const int left = target_subtree_root_index * 2 + 1; - const int right = target_subtree_root_index * 2 + 2; - - // If left child is smaller than current minimum. - if (left < heap_size && - scored_terms.at(left).hit_count < scored_terms.at(min).hit_count) { - min = left; - } - - // If right child is smaller than current minimum. 
- if (right < heap_size && - scored_terms.at(right).hit_count < scored_terms.at(min).hit_count) { - min = right; - } - - // If the minimum is not the subtree root, swap and continue heapifying the - // lower level subtree. - if (min != target_subtree_root_index) { - std::swap(scored_terms.at(min), - scored_terms.at(target_subtree_root_index)); - HeapifyTermDown(scored_terms, min); - } -} - -// Heapify the given term vector from bottom to top. Call it after add an -// element at the end of the vector. -void HeapifyTermUp(std::vector<TermMetadata>& scored_terms, - int target_subtree_child_index) { - // If we represent a heap in an array/vector, indices of root can be - // calculated as such. - const int root = (target_subtree_child_index + 1) / 2 - 1; - - // If the current child is smaller than the root, swap and continue heapifying - // the upper level subtree - if (root >= 0 && scored_terms.at(target_subtree_child_index).hit_count < - scored_terms.at(root).hit_count) { - std::swap(scored_terms.at(root), - scored_terms.at(target_subtree_child_index)); - HeapifyTermUp(scored_terms, root); - } -} - -TermMetadata PopRootTerm(std::vector<TermMetadata>& scored_terms) { - if (scored_terms.empty()) { - // Return an invalid TermMetadata as a sentinel value. - return TermMetadata(/*content_in=*/"", /*hit_count_in=*/-1); - } - - // Steps to extract root from heap: - // 1. copy out root - TermMetadata root = scored_terms.at(0); - const size_t last_node_index = scored_terms.size() - 1; - // 2. swap root and the last node - std::swap(scored_terms.at(0), scored_terms.at(last_node_index)); - // 3. remove last node - scored_terms.pop_back(); - // 4. heapify root - HeapifyTermDown(scored_terms, /*target_subtree_root_index=*/0); - return root; -} - // Helper function to extract the root from the heap. The heap structure will be // maintained. // @@ -190,19 +115,6 @@ void BuildHeapInPlace( } } -void PushToTermHeap(TermMetadata term, int number_to_return, - std::vector<TermMetadata>& scored_terms_heap) { - if (scored_terms_heap.size() < number_to_return) { - scored_terms_heap.push_back(std::move(term)); - // We insert at end, so we should heapify bottom up. - HeapifyTermUp(scored_terms_heap, scored_terms_heap.size() - 1); - } else if (scored_terms_heap.at(0).hit_count < term.hit_count) { - scored_terms_heap.at(0) = std::move(term); - // We insert at root, so we should heapify top down. - HeapifyTermDown(scored_terms_heap, /*target_subtree_root_index=*/0); - } -} - std::vector<ScoredDocumentHit> PopTopResultsFromHeap( std::vector<ScoredDocumentHit>* scored_document_hits_heap, int num_results, const ScoredDocumentHitComparator& scored_document_hit_comparator) { @@ -222,15 +134,5 @@ std::vector<ScoredDocumentHit> PopTopResultsFromHeap( return scored_document_hit_result; } -std::vector<TermMetadata> PopAllTermsFromHeap( - std::vector<TermMetadata>& scored_terms_heap) { - std::vector<TermMetadata> top_term_result; - top_term_result.reserve(scored_terms_heap.size()); - while (!scored_terms_heap.empty()) { - top_term_result.push_back(PopRootTerm(scored_terms_heap)); - } - return top_term_result; -} - } // namespace lib } // namespace icing diff --git a/icing/scoring/ranker.h b/icing/scoring/ranker.h index 81838f3..785c133 100644 --- a/icing/scoring/ranker.h +++ b/icing/scoring/ranker.h @@ -17,7 +17,6 @@ #include <vector> -#include "icing/index/term-metadata.h" #include "icing/scoring/scored-document-hit.h" // Provides functionality to get the top N results from an unsorted vector. 
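// A self-contained sketch of the min-heap top-K strategy these helpers
// implement, using std::priority_queue for brevity (plain int scores stand in
// for TermMetadata hit counts; this is not the actual Icing implementation):
//
//   #include <functional>
//   #include <queue>
//   #include <vector>
//
//   std::vector<int> TopK(const std::vector<int>& scores, size_t k) {
//     // std::greater makes this a min-heap, so top() is the smallest score kept.
//     std::priority_queue<int, std::vector<int>, std::greater<int>> heap;
//     for (int score : scores) {
//       if (heap.size() < k) {
//         heap.push(score);  // O(log k) per push
//       } else if (score > heap.top()) {
//         heap.pop();        // evict the current minimum
//         heap.push(score);
//       }
//     }
//     std::vector<int> result;  // drains in ascending order, as the heap helpers note
//     while (!heap.empty()) {
//       result.push_back(heap.top());
//       heap.pop();
//     }
//     return result;
//   }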
@@ -40,18 +39,6 @@ std::vector<ScoredDocumentHit> PopTopResultsFromHeap( std::vector<ScoredDocumentHit>* scored_document_hits_heap, int num_results, const ScoredDocumentHitComparator& scored_document_hit_comparator); -// The heap is a min-heap. So that we can avoid some push operations by -// comparing to the root term, and only pushing if greater than root. The time -// complexity for a single push is O(lgK) which K is the number_to_return. -// REQUIRED: scored_terms_heap is not null. -void PushToTermHeap(TermMetadata term, int number_to_return, - std::vector<TermMetadata>& scored_terms_heap); - -// Return all terms from the given terms heap. And since the heap is a min-heap, -// the output vector will be increasing order. -// REQUIRED: scored_terms_heap is not null. -std::vector<TermMetadata> PopAllTermsFromHeap( - std::vector<TermMetadata>& scored_terms_heap); } // namespace lib } // namespace icing diff --git a/icing/scoring/score-and-rank_benchmark.cc b/icing/scoring/score-and-rank_benchmark.cc index cc1d995..e940e98 100644 --- a/icing/scoring/score-and-rank_benchmark.cc +++ b/icing/scoring/score-and-rank_benchmark.cc @@ -117,8 +117,7 @@ void BM_ScoreAndRankDocumentHitsByDocumentScore(benchmark::State& state) { scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<ScoringProcessor> scoring_processor, - ScoringProcessor::Create(scoring_spec, document_store.get(), - schema_store.get())); + ScoringProcessor::Create(scoring_spec, document_store.get())); int num_to_score = state.range(0); int num_of_documents = state.range(1); @@ -221,8 +220,7 @@ void BM_ScoreAndRankDocumentHitsByCreationTime(benchmark::State& state) { ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<ScoringProcessor> scoring_processor, - ScoringProcessor::Create(scoring_spec, document_store.get(), - schema_store.get())); + ScoringProcessor::Create(scoring_spec, document_store.get())); int num_to_score = state.range(0); int num_of_documents = state.range(1); @@ -324,8 +322,7 @@ void BM_ScoreAndRankDocumentHitsNoScoring(benchmark::State& state) { scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::NONE); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<ScoringProcessor> scoring_processor, - ScoringProcessor::Create(scoring_spec, document_store.get(), - schema_store.get())); + ScoringProcessor::Create(scoring_spec, document_store.get())); int num_to_score = state.range(0); int num_of_documents = state.range(1); @@ -393,122 +390,6 @@ BENCHMARK(BM_ScoreAndRankDocumentHitsNoScoring) ->ArgPair(10000, 18000) ->ArgPair(10000, 20000); -void BM_ScoreAndRankDocumentHitsByRelevanceScoring(benchmark::State& state) { - const std::string base_dir = GetTestTempDir() + "/score_and_rank_benchmark"; - const std::string document_store_dir = base_dir + "/document_store"; - const std::string schema_store_dir = base_dir + "/schema_store"; - - // Creates file directories - Filesystem filesystem; - filesystem.DeleteDirectoryRecursively(base_dir.c_str()); - filesystem.CreateDirectoryRecursively(document_store_dir.c_str()); - filesystem.CreateDirectoryRecursively(schema_store_dir.c_str()); - - Clock clock; - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem, base_dir, &clock)); - - ICING_ASSERT_OK_AND_ASSIGN( - DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem, document_store_dir, &clock, - schema_store.get())); - std::unique_ptr<DocumentStore> 
document_store = - std::move(create_result.document_store); - - ICING_ASSERT_OK(schema_store->SetSchema(CreateSchemaWithEmailType())); - - ScoringSpecProto scoring_spec; - scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE); - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<ScoringProcessor> scoring_processor, - ScoringProcessor::Create(scoring_spec, document_store.get(), - schema_store.get())); - - int num_to_score = state.range(0); - int num_of_documents = state.range(1); - - std::mt19937 random_generator; - std::uniform_int_distribution<int> distribution( - 1, std::numeric_limits<int>::max()); - - SectionId section_id = 0; - SectionIdMask section_id_mask = 1U << section_id; - - // Puts documents into document store - std::vector<DocHitInfo> doc_hit_infos; - for (int i = 0; i < num_of_documents; i++) { - ICING_ASSERT_OK_AND_ASSIGN( - DocumentId document_id, - document_store->Put(CreateEmailDocument( - /*id=*/i, /*document_score=*/1, - /*creation_timestamp_ms=*/1), - /*num_tokens=*/10)); - DocHitInfo doc_hit = DocHitInfo(document_id, section_id_mask); - // Set five matches for term "foo" for each document hit. - doc_hit.UpdateSection(section_id, /*hit_term_frequency=*/5); - doc_hit_infos.push_back(doc_hit); - } - - ScoredDocumentHitComparator scored_document_hit_comparator( - /*is_descending=*/true); - - for (auto _ : state) { - // Creates a dummy DocHitInfoIterator with results, we need to pause the - // timer here so that the cost of copying test data is not included. - state.PauseTiming(); - std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator = - std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo"); - // Create a query term iterator that assigns the document hits to term - // "foo". - std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>> - query_term_iterators; - query_term_iterators["foo"] = - std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo"); - state.ResumeTiming(); - - std::vector<ScoredDocumentHit> scored_document_hits = - scoring_processor->Score(std::move(doc_hit_info_iterator), num_to_score, - &query_term_iterators); - - BuildHeapInPlace(&scored_document_hits, scored_document_hit_comparator); - // Ranks and gets the first page, 20 is a common page size - std::vector<ScoredDocumentHit> results = - PopTopResultsFromHeap(&scored_document_hits, /*num_results=*/20, - scored_document_hit_comparator); - } - - // Clean up - document_store.reset(); - schema_store.reset(); - filesystem.DeleteDirectoryRecursively(base_dir.c_str()); -} -BENCHMARK(BM_ScoreAndRankDocumentHitsByRelevanceScoring) - // num_to_score, num_of_documents in document store - ->ArgPair(1000, 30000) - ->ArgPair(3000, 30000) - ->ArgPair(5000, 30000) - ->ArgPair(7000, 30000) - ->ArgPair(9000, 30000) - ->ArgPair(11000, 30000) - ->ArgPair(13000, 30000) - ->ArgPair(15000, 30000) - ->ArgPair(17000, 30000) - ->ArgPair(19000, 30000) - ->ArgPair(21000, 30000) - ->ArgPair(23000, 30000) - ->ArgPair(25000, 30000) - ->ArgPair(27000, 30000) - ->ArgPair(29000, 30000) - // Starting from this line, we're trying to see if num_of_documents affects - // performance - ->ArgPair(10000, 10000) - ->ArgPair(10000, 12000) - ->ArgPair(10000, 14000) - ->ArgPair(10000, 16000) - ->ArgPair(10000, 18000) - ->ArgPair(10000, 20000); - } // namespace } // namespace lib diff --git a/icing/scoring/scorer.cc b/icing/scoring/scorer.cc index 5f33e66..a4734b4 100644 --- a/icing/scoring/scorer.cc +++ b/icing/scoring/scorer.cc @@ -22,7 +22,6 @@ #include 
"icing/index/iterator/doc-hit-info-iterator.h" #include "icing/proto/scoring.pb.h" #include "icing/scoring/bm25f-calculator.h" -#include "icing/scoring/section-weights.h" #include "icing/store/document-id.h" #include "icing/store/document-store.h" #include "icing/util/status-macros.h" @@ -157,12 +156,11 @@ class NoScorer : public Scorer { }; libtextclassifier3::StatusOr<std::unique_ptr<Scorer>> Scorer::Create( - const ScoringSpecProto& scoring_spec, double default_score, - const DocumentStore* document_store, const SchemaStore* schema_store) { + ScoringSpecProto::RankingStrategy::Code rank_by, double default_score, + const DocumentStore* document_store) { ICING_RETURN_ERROR_IF_NULL(document_store); - ICING_RETURN_ERROR_IF_NULL(schema_store); - switch (scoring_spec.rank_by()) { + switch (rank_by) { case ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE: return std::make_unique<DocumentScoreScorer>(document_store, default_score); @@ -170,12 +168,7 @@ libtextclassifier3::StatusOr<std::unique_ptr<Scorer>> Scorer::Create( return std::make_unique<DocumentCreationTimestampScorer>(document_store, default_score); case ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE: { - ICING_ASSIGN_OR_RETURN( - std::unique_ptr<SectionWeights> section_weights, - SectionWeights::Create(schema_store, scoring_spec)); - - auto bm25f_calculator = std::make_unique<Bm25fCalculator>( - document_store, std::move(section_weights)); + auto bm25f_calculator = std::make_unique<Bm25fCalculator>(document_store); return std::make_unique<RelevanceScoreScorer>(std::move(bm25f_calculator), default_score); } @@ -190,8 +183,8 @@ libtextclassifier3::StatusOr<std::unique_ptr<Scorer>> Scorer::Create( case ScoringSpecProto::RankingStrategy::USAGE_TYPE2_LAST_USED_TIMESTAMP: [[fallthrough]]; case ScoringSpecProto::RankingStrategy::USAGE_TYPE3_LAST_USED_TIMESTAMP: - return std::make_unique<UsageScorer>( - document_store, scoring_spec.rank_by(), default_score); + return std::make_unique<UsageScorer>(document_store, rank_by, + default_score); case ScoringSpecProto::RankingStrategy::NONE: return std::make_unique<NoScorer>(default_score); } diff --git a/icing/scoring/scorer.h b/icing/scoring/scorer.h index abdd5ca..a22db0f 100644 --- a/icing/scoring/scorer.h +++ b/icing/scoring/scorer.h @@ -43,8 +43,8 @@ class Scorer { // FAILED_PRECONDITION on any null pointer input // INVALID_ARGUMENT if fails to create an instance static libtextclassifier3::StatusOr<std::unique_ptr<Scorer>> Create( - const ScoringSpecProto& scoring_spec, double default_score, - const DocumentStore* document_store, const SchemaStore* schema_store); + ScoringSpecProto::RankingStrategy::Code rank_by, double default_score, + const DocumentStore* document_store); // Returns a non-negative score of a document. 
The score can be a // document-associated score which comes from the DocumentProto directly, an diff --git a/icing/scoring/scorer_test.cc b/icing/scoring/scorer_test.cc index fef612d..8b89514 100644 --- a/icing/scoring/scorer_test.cc +++ b/icing/scoring/scorer_test.cc @@ -27,7 +27,6 @@ #include "icing/proto/scoring.pb.h" #include "icing/schema-builder.h" #include "icing/schema/schema-store.h" -#include "icing/scoring/section-weights.h" #include "icing/store/document-id.h" #include "icing/store/document-store.h" #include "icing/testing/common-matchers.h" @@ -40,11 +39,11 @@ namespace lib { namespace { using ::testing::Eq; -constexpr PropertyConfigProto::DataType::Code TYPE_STRING = - PropertyConfigProto::DataType::STRING; +constexpr PropertyConfigProto_DataType_Code TYPE_STRING = + PropertyConfigProto_DataType_Code_STRING; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED = - PropertyConfigProto::Cardinality::REQUIRED; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REQUIRED = + PropertyConfigProto_Cardinality_Code_REQUIRED; class ScorerTest : public testing::Test { protected: @@ -92,8 +91,6 @@ class ScorerTest : public testing::Test { DocumentStore* document_store() { return document_store_.get(); } - SchemaStore* schema_store() { return schema_store_.get(); } - const FakeClock& fake_clock1() { return fake_clock1_; } const FakeClock& fake_clock2() { return fake_clock2_; } @@ -124,37 +121,17 @@ UsageReport CreateUsageReport(std::string name_space, std::string uri, return usage_report; } -ScoringSpecProto CreateScoringSpecForRankingStrategy( - ScoringSpecProto::RankingStrategy::Code ranking_strategy) { - ScoringSpecProto scoring_spec; - scoring_spec.set_rank_by(ranking_strategy); - return scoring_spec; -} - -TEST_F(ScorerTest, CreationWithNullDocumentStoreShouldFail) { - EXPECT_THAT( - Scorer::Create(CreateScoringSpecForRankingStrategy( - ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE), - /*default_score=*/0, /*document_store=*/nullptr, - schema_store()), - StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); -} - -TEST_F(ScorerTest, CreationWithNullSchemaStoreShouldFail) { - EXPECT_THAT( - Scorer::Create(CreateScoringSpecForRankingStrategy( - ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE), - /*default_score=*/0, document_store(), - /*schema_store=*/nullptr), - StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); +TEST_F(ScorerTest, CreationWithNullPointerShouldFail) { + EXPECT_THAT(Scorer::Create(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE, + /*default_score=*/0, /*document_store=*/nullptr), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); } TEST_F(ScorerTest, ShouldGetDefaultScoreIfDocumentDoesntExist) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Scorer> scorer, - Scorer::Create(CreateScoringSpecForRankingStrategy( - ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE), - /*default_score=*/10, document_store(), schema_store())); + Scorer::Create(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE, + /*default_score=*/10, document_store())); // Non existent document id DocHitInfo docHitInfo = DocHitInfo(/*document_id_in=*/1); @@ -176,9 +153,8 @@ TEST_F(ScorerTest, ShouldGetDefaultScoreIfDocumentIsDeleted) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Scorer> scorer, - Scorer::Create(CreateScoringSpecForRankingStrategy( - ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE), - /*default_score=*/10, document_store(), schema_store())); + Scorer::Create(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE, + 
/*default_score=*/10, document_store())); DocHitInfo docHitInfo = DocHitInfo(document_id); @@ -209,9 +185,8 @@ TEST_F(ScorerTest, ShouldGetDefaultScoreIfDocumentIsExpired) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Scorer> scorer, - Scorer::Create(CreateScoringSpecForRankingStrategy( - ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE), - /*default_score=*/10, document_store(), schema_store())); + Scorer::Create(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE, + /*default_score=*/10, document_store())); DocHitInfo docHitInfo = DocHitInfo(document_id); @@ -238,9 +213,8 @@ TEST_F(ScorerTest, ShouldGetDefaultDocumentScore) { document_store()->Put(test_document)); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Scorer> scorer, - Scorer::Create(CreateScoringSpecForRankingStrategy( - ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE), - /*default_score=*/10, document_store(), schema_store())); + Scorer::Create(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE, + /*default_score=*/10, document_store())); DocHitInfo docHitInfo = DocHitInfo(document_id); EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(0)); @@ -261,9 +235,8 @@ TEST_F(ScorerTest, ShouldGetCorrectDocumentScore) { document_store()->Put(test_document)); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Scorer> scorer, - Scorer::Create(CreateScoringSpecForRankingStrategy( - ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE), - /*default_score=*/0, document_store(), schema_store())); + Scorer::Create(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE, + /*default_score=*/0, document_store())); DocHitInfo docHitInfo = DocHitInfo(document_id); EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(5)); @@ -286,9 +259,8 @@ TEST_F(ScorerTest, QueryIteratorNullRelevanceScoreShouldReturnDefaultScore) { document_store()->Put(test_document)); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Scorer> scorer, - Scorer::Create(CreateScoringSpecForRankingStrategy( - ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE), - /*default_score=*/10, document_store(), schema_store())); + Scorer::Create(ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE, + /*default_score=*/10, document_store())); DocHitInfo docHitInfo = DocHitInfo(document_id); EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(10)); @@ -318,9 +290,8 @@ TEST_F(ScorerTest, ShouldGetCorrectCreationTimestampScore) { document_store()->Put(test_document2)); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Scorer> scorer, - Scorer::Create(CreateScoringSpecForRankingStrategy( - ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP), - /*default_score=*/0, document_store(), schema_store())); + Scorer::Create(ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP, + /*default_score=*/0, document_store())); DocHitInfo docHitInfo1 = DocHitInfo(document_id1); DocHitInfo docHitInfo2 = DocHitInfo(document_id2); @@ -345,19 +316,16 @@ TEST_F(ScorerTest, ShouldGetCorrectUsageCountScoreForType1) { // Create 3 scorers for 3 different usage types. 
ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Scorer> scorer1, - Scorer::Create(CreateScoringSpecForRankingStrategy( - ScoringSpecProto::RankingStrategy::USAGE_TYPE1_COUNT), - /*default_score=*/0, document_store(), schema_store())); + Scorer::Create(ScoringSpecProto::RankingStrategy::USAGE_TYPE1_COUNT, + /*default_score=*/0, document_store())); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Scorer> scorer2, - Scorer::Create(CreateScoringSpecForRankingStrategy( - ScoringSpecProto::RankingStrategy::USAGE_TYPE2_COUNT), - /*default_score=*/0, document_store(), schema_store())); + Scorer::Create(ScoringSpecProto::RankingStrategy::USAGE_TYPE2_COUNT, + /*default_score=*/0, document_store())); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Scorer> scorer3, - Scorer::Create(CreateScoringSpecForRankingStrategy( - ScoringSpecProto::RankingStrategy::USAGE_TYPE3_COUNT), - /*default_score=*/0, document_store(), schema_store())); + Scorer::Create(ScoringSpecProto::RankingStrategy::USAGE_TYPE3_COUNT, + /*default_score=*/0, document_store())); DocHitInfo docHitInfo = DocHitInfo(document_id); EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0)); EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0)); @@ -389,19 +357,16 @@ TEST_F(ScorerTest, ShouldGetCorrectUsageCountScoreForType2) { // Create 3 scorers for 3 different usage types. ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Scorer> scorer1, - Scorer::Create(CreateScoringSpecForRankingStrategy( - ScoringSpecProto::RankingStrategy::USAGE_TYPE1_COUNT), - /*default_score=*/0, document_store(), schema_store())); + Scorer::Create(ScoringSpecProto::RankingStrategy::USAGE_TYPE1_COUNT, + /*default_score=*/0, document_store())); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Scorer> scorer2, - Scorer::Create(CreateScoringSpecForRankingStrategy( - ScoringSpecProto::RankingStrategy::USAGE_TYPE2_COUNT), - /*default_score=*/0, document_store(), schema_store())); + Scorer::Create(ScoringSpecProto::RankingStrategy::USAGE_TYPE2_COUNT, + /*default_score=*/0, document_store())); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Scorer> scorer3, - Scorer::Create(CreateScoringSpecForRankingStrategy( - ScoringSpecProto::RankingStrategy::USAGE_TYPE3_COUNT), - /*default_score=*/0, document_store(), schema_store())); + Scorer::Create(ScoringSpecProto::RankingStrategy::USAGE_TYPE3_COUNT, + /*default_score=*/0, document_store())); DocHitInfo docHitInfo = DocHitInfo(document_id); EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0)); EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0)); @@ -433,19 +398,16 @@ TEST_F(ScorerTest, ShouldGetCorrectUsageCountScoreForType3) { // Create 3 scorers for 3 different usage types. 
ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Scorer> scorer1, - Scorer::Create(CreateScoringSpecForRankingStrategy( - ScoringSpecProto::RankingStrategy::USAGE_TYPE1_COUNT), - /*default_score=*/0, document_store(), schema_store())); + Scorer::Create(ScoringSpecProto::RankingStrategy::USAGE_TYPE1_COUNT, + /*default_score=*/0, document_store())); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Scorer> scorer2, - Scorer::Create(CreateScoringSpecForRankingStrategy( - ScoringSpecProto::RankingStrategy::USAGE_TYPE2_COUNT), - /*default_score=*/0, document_store(), schema_store())); + Scorer::Create(ScoringSpecProto::RankingStrategy::USAGE_TYPE2_COUNT, + /*default_score=*/0, document_store())); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Scorer> scorer3, - Scorer::Create(CreateScoringSpecForRankingStrategy( - ScoringSpecProto::RankingStrategy::USAGE_TYPE3_COUNT), - /*default_score=*/0, document_store(), schema_store())); + Scorer::Create(ScoringSpecProto::RankingStrategy::USAGE_TYPE3_COUNT, + /*default_score=*/0, document_store())); DocHitInfo docHitInfo = DocHitInfo(document_id); EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0)); EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0)); @@ -477,22 +439,19 @@ TEST_F(ScorerTest, ShouldGetCorrectUsageTimestampScoreForType1) { // Create 3 scorers for 3 different usage types. ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Scorer> scorer1, - Scorer::Create(CreateScoringSpecForRankingStrategy( - ScoringSpecProto::RankingStrategy:: - USAGE_TYPE1_LAST_USED_TIMESTAMP), - /*default_score=*/0, document_store(), schema_store())); + Scorer::Create( + ScoringSpecProto::RankingStrategy::USAGE_TYPE1_LAST_USED_TIMESTAMP, + /*default_score=*/0, document_store())); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Scorer> scorer2, - Scorer::Create(CreateScoringSpecForRankingStrategy( - ScoringSpecProto::RankingStrategy:: - USAGE_TYPE2_LAST_USED_TIMESTAMP), - /*default_score=*/0, document_store(), schema_store())); + Scorer::Create( + ScoringSpecProto::RankingStrategy::USAGE_TYPE2_LAST_USED_TIMESTAMP, + /*default_score=*/0, document_store())); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Scorer> scorer3, - Scorer::Create(CreateScoringSpecForRankingStrategy( - ScoringSpecProto::RankingStrategy:: - USAGE_TYPE3_LAST_USED_TIMESTAMP), - /*default_score=*/0, document_store(), schema_store())); + Scorer::Create( + ScoringSpecProto::RankingStrategy::USAGE_TYPE3_LAST_USED_TIMESTAMP, + /*default_score=*/0, document_store())); DocHitInfo docHitInfo = DocHitInfo(document_id); EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0)); EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0)); @@ -540,22 +499,19 @@ TEST_F(ScorerTest, ShouldGetCorrectUsageTimestampScoreForType2) { // Create 3 scorers for 3 different usage types. 
ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Scorer> scorer1, - Scorer::Create(CreateScoringSpecForRankingStrategy( - ScoringSpecProto::RankingStrategy:: - USAGE_TYPE1_LAST_USED_TIMESTAMP), - /*default_score=*/0, document_store(), schema_store())); + Scorer::Create( + ScoringSpecProto::RankingStrategy::USAGE_TYPE1_LAST_USED_TIMESTAMP, + /*default_score=*/0, document_store())); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Scorer> scorer2, - Scorer::Create(CreateScoringSpecForRankingStrategy( - ScoringSpecProto::RankingStrategy:: - USAGE_TYPE2_LAST_USED_TIMESTAMP), - /*default_score=*/0, document_store(), schema_store())); + Scorer::Create( + ScoringSpecProto::RankingStrategy::USAGE_TYPE2_LAST_USED_TIMESTAMP, + /*default_score=*/0, document_store())); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Scorer> scorer3, - Scorer::Create(CreateScoringSpecForRankingStrategy( - ScoringSpecProto::RankingStrategy:: - USAGE_TYPE3_LAST_USED_TIMESTAMP), - /*default_score=*/0, document_store(), schema_store())); + Scorer::Create( + ScoringSpecProto::RankingStrategy::USAGE_TYPE3_LAST_USED_TIMESTAMP, + /*default_score=*/0, document_store())); DocHitInfo docHitInfo = DocHitInfo(document_id); EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0)); EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0)); @@ -603,22 +559,19 @@ TEST_F(ScorerTest, ShouldGetCorrectUsageTimestampScoreForType3) { // Create 3 scorers for 3 different usage types. ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Scorer> scorer1, - Scorer::Create(CreateScoringSpecForRankingStrategy( - ScoringSpecProto::RankingStrategy:: - USAGE_TYPE1_LAST_USED_TIMESTAMP), - /*default_score=*/0, document_store(), schema_store())); + Scorer::Create( + ScoringSpecProto::RankingStrategy::USAGE_TYPE1_LAST_USED_TIMESTAMP, + /*default_score=*/0, document_store())); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Scorer> scorer2, - Scorer::Create(CreateScoringSpecForRankingStrategy( - ScoringSpecProto::RankingStrategy:: - USAGE_TYPE2_LAST_USED_TIMESTAMP), - /*default_score=*/0, document_store(), schema_store())); + Scorer::Create( + ScoringSpecProto::RankingStrategy::USAGE_TYPE2_LAST_USED_TIMESTAMP, + /*default_score=*/0, document_store())); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Scorer> scorer3, - Scorer::Create(CreateScoringSpecForRankingStrategy( - ScoringSpecProto::RankingStrategy:: - USAGE_TYPE3_LAST_USED_TIMESTAMP), - /*default_score=*/0, document_store(), schema_store())); + Scorer::Create( + ScoringSpecProto::RankingStrategy::USAGE_TYPE3_LAST_USED_TIMESTAMP, + /*default_score=*/0, document_store())); DocHitInfo docHitInfo = DocHitInfo(document_id); EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0)); EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0)); @@ -654,9 +607,8 @@ TEST_F(ScorerTest, ShouldGetCorrectUsageTimestampScoreForType3) { TEST_F(ScorerTest, NoScorerShouldAlwaysReturnDefaultScore) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Scorer> scorer, - Scorer::Create(CreateScoringSpecForRankingStrategy( - ScoringSpecProto::RankingStrategy::NONE), - /*default_score=*/3, document_store(), schema_store())); + Scorer::Create(ScoringSpecProto::RankingStrategy::NONE, + /*default_score=*/3, document_store())); DocHitInfo docHitInfo1 = DocHitInfo(/*document_id_in=*/0); DocHitInfo docHitInfo2 = DocHitInfo(/*document_id_in=*/1); @@ -666,10 +618,8 @@ TEST_F(ScorerTest, NoScorerShouldAlwaysReturnDefaultScore) { EXPECT_THAT(scorer->GetScore(docHitInfo3), Eq(3)); ICING_ASSERT_OK_AND_ASSIGN( - scorer, - Scorer::Create(CreateScoringSpecForRankingStrategy( - 
ScoringSpecProto::RankingStrategy::NONE), - /*default_score=*/111, document_store(), schema_store())); + scorer, Scorer::Create(ScoringSpecProto::RankingStrategy::NONE, + /*default_score=*/111, document_store())); docHitInfo1 = DocHitInfo(/*document_id_in=*/4); docHitInfo2 = DocHitInfo(/*document_id_in=*/5); @@ -693,10 +643,9 @@ TEST_F(ScorerTest, ShouldScaleUsageTimestampScoreForMaxTimestamp) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Scorer> scorer1, - Scorer::Create(CreateScoringSpecForRankingStrategy( - ScoringSpecProto::RankingStrategy:: - USAGE_TYPE1_LAST_USED_TIMESTAMP), - /*default_score=*/0, document_store(), schema_store())); + Scorer::Create( + ScoringSpecProto::RankingStrategy::USAGE_TYPE1_LAST_USED_TIMESTAMP, + /*default_score=*/0, document_store())); DocHitInfo docHitInfo = DocHitInfo(document_id); // Create usage report for the maximum allowable timestamp. diff --git a/icing/scoring/scoring-processor.cc b/icing/scoring/scoring-processor.cc index e36f3bb..24480ef 100644 --- a/icing/scoring/scoring-processor.cc +++ b/icing/scoring/scoring-processor.cc @@ -39,20 +39,19 @@ constexpr double kDefaultScoreInAscendingOrder = libtextclassifier3::StatusOr<std::unique_ptr<ScoringProcessor>> ScoringProcessor::Create(const ScoringSpecProto& scoring_spec, - const DocumentStore* document_store, - const SchemaStore* schema_store) { + const DocumentStore* document_store) { ICING_RETURN_ERROR_IF_NULL(document_store); - ICING_RETURN_ERROR_IF_NULL(schema_store); bool is_descending_order = scoring_spec.order_by() == ScoringSpecProto::Order::DESC; ICING_ASSIGN_OR_RETURN( std::unique_ptr<Scorer> scorer, - Scorer::Create(scoring_spec, + Scorer::Create(scoring_spec.rank_by(), is_descending_order ? kDefaultScoreInDescendingOrder : kDefaultScoreInAscendingOrder, - document_store, schema_store)); + document_store)); + // Using `new` to access a non-public constructor. return std::unique_ptr<ScoringProcessor>( new ScoringProcessor(std::move(scorer))); diff --git a/icing/scoring/scoring-processor.h b/icing/scoring/scoring-processor.h index e7d09b1..2289605 100644 --- a/icing/scoring/scoring-processor.h +++ b/icing/scoring/scoring-processor.h @@ -40,8 +40,8 @@ class ScoringProcessor { // A ScoringProcessor on success // FAILED_PRECONDITION on any null pointer input static libtextclassifier3::StatusOr<std::unique_ptr<ScoringProcessor>> Create( - const ScoringSpecProto& scoring_spec, const DocumentStore* document_store, - const SchemaStore* schema_store); + const ScoringSpecProto& scoring_spec, + const DocumentStore* document_store); // Assigns scores to DocHitInfos from the given DocHitInfoIterator and returns // a vector of ScoredDocumentHits. 
The size of results is no more than diff --git a/icing/scoring/scoring-processor_test.cc b/icing/scoring/scoring-processor_test.cc index b42ba31..125e2a7 100644 --- a/icing/scoring/scoring-processor_test.cc +++ b/icing/scoring/scoring-processor_test.cc @@ -34,16 +34,14 @@ namespace lib { namespace { using ::testing::ElementsAre; -using ::testing::Eq; -using ::testing::Gt; using ::testing::IsEmpty; using ::testing::SizeIs; -constexpr PropertyConfigProto::DataType::Code TYPE_STRING = - PropertyConfigProto::DataType::STRING; +constexpr PropertyConfigProto_DataType_Code TYPE_STRING = + PropertyConfigProto_DataType_Code_STRING; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = - PropertyConfigProto::Cardinality::OPTIONAL; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = + PropertyConfigProto_Cardinality_Code_OPTIONAL; class ScoringProcessorTest : public testing::Test { protected: @@ -60,7 +58,7 @@ class ScoringProcessorTest : public testing::Test { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, @@ -71,24 +69,11 @@ class ScoringProcessorTest : public testing::Test { // Creates a simple email schema SchemaProto test_email_schema = SchemaBuilder() - .AddType(SchemaTypeConfigBuilder() - .SetType("email") - .AddProperty( - PropertyConfigBuilder() - .SetName("subject") - .SetDataTypeString( - TermMatchType::PREFIX, - StringIndexingConfig::TokenizerType::PLAIN) - .SetDataType(TYPE_STRING) - .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("body") - .SetDataTypeString( - TermMatchType::PREFIX, - StringIndexingConfig::TokenizerType::PLAIN) - .SetDataType(TYPE_STRING) - .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); ICING_ASSERT_OK(schema_store_->SetSchema(test_email_schema)); } @@ -101,8 +86,6 @@ class ScoringProcessorTest : public testing::Test { DocumentStore* document_store() { return document_store_.get(); } - SchemaStore* schema_store() { return schema_store_.get(); } - private: const std::string test_dir_; const std::string doc_store_dir_; @@ -156,46 +139,16 @@ UsageReport CreateUsageReport(std::string name_space, std::string uri, return usage_report; } -TypePropertyWeights CreateTypePropertyWeights( - std::string schema_type, std::vector<PropertyWeight> property_weights) { - TypePropertyWeights type_property_weights; - type_property_weights.set_schema_type(std::move(schema_type)); - type_property_weights.mutable_property_weights()->Reserve( - property_weights.size()); - - for (PropertyWeight& property_weight : property_weights) { - *type_property_weights.add_property_weights() = std::move(property_weight); - } - - return type_property_weights; -} - -PropertyWeight CreatePropertyWeight(std::string path, double weight) { - PropertyWeight property_weight; - property_weight.set_path(std::move(path)); - property_weight.set_weight(weight); - return property_weight; -} - -TEST_F(ScoringProcessorTest, CreationWithNullDocumentStoreShouldFail) { +TEST_F(ScoringProcessorTest, CreationWithNullPointerShouldFail) { ScoringSpecProto spec_proto; - EXPECT_THAT(ScoringProcessor::Create(spec_proto, /*document_store=*/nullptr, - schema_store()), - 
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); -} - -TEST_F(ScoringProcessorTest, CreationWithNullSchemaStoreShouldFail) { - ScoringSpecProto spec_proto; - EXPECT_THAT(ScoringProcessor::Create(spec_proto, document_store(), - /*schema_store=*/nullptr), + EXPECT_THAT(ScoringProcessor::Create(spec_proto, /*document_store=*/nullptr), StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); } TEST_F(ScoringProcessorTest, ShouldCreateInstance) { ScoringSpecProto spec_proto; spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE); - ICING_EXPECT_OK( - ScoringProcessor::Create(spec_proto, document_store(), schema_store())); + ICING_EXPECT_OK(ScoringProcessor::Create(spec_proto, document_store())); } TEST_F(ScoringProcessorTest, ShouldHandleEmptyDocHitIterator) { @@ -210,7 +163,7 @@ TEST_F(ScoringProcessorTest, ShouldHandleEmptyDocHitIterator) { // Creates a ScoringProcessor ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<ScoringProcessor> scoring_processor, - ScoringProcessor::Create(spec_proto, document_store(), schema_store())); + ScoringProcessor::Create(spec_proto, document_store())); EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator), /*num_to_score=*/5), @@ -236,7 +189,7 @@ TEST_F(ScoringProcessorTest, ShouldHandleNonPositiveNumToScore) { // Creates a ScoringProcessor ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<ScoringProcessor> scoring_processor, - ScoringProcessor::Create(spec_proto, document_store(), schema_store())); + ScoringProcessor::Create(spec_proto, document_store())); EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator), /*num_to_score=*/-1), @@ -266,7 +219,7 @@ TEST_F(ScoringProcessorTest, ShouldRespectNumToScore) { // Creates a ScoringProcessor ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<ScoringProcessor> scoring_processor, - ScoringProcessor::Create(spec_proto, document_store(), schema_store())); + ScoringProcessor::Create(spec_proto, document_store())); EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator), /*num_to_score=*/2), @@ -298,7 +251,7 @@ TEST_F(ScoringProcessorTest, ShouldScoreByDocumentScore) { // Creates a ScoringProcessor ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<ScoringProcessor> scoring_processor, - ScoringProcessor::Create(spec_proto, document_store(), schema_store())); + ScoringProcessor::Create(spec_proto, document_store())); EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator), /*num_to_score=*/3), @@ -353,7 +306,7 @@ TEST_F(ScoringProcessorTest, // Creates a ScoringProcessor ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<ScoringProcessor> scoring_processor, - ScoringProcessor::Create(spec_proto, document_store(), schema_store())); + ScoringProcessor::Create(spec_proto, document_store())); std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>> query_term_iterators; @@ -363,11 +316,11 @@ TEST_F(ScoringProcessorTest, // the document's length determines the final score. Document shorter than the // average corpus length are slightly boosted. 
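// A sketch of the normalization that produces this boost, using the file's
// constants k1 = 1.2 and b = 0.7 (the helper name and the numbers below are
// illustrative only):
//
//   float NormalizedTf(float f_q, float dl, float avgdl) {
//     constexpr float k1 = 1.2f, b = 0.7f;
//     return f_q * (k1 + 1) / (f_q + k1 * (1 - b + b * dl / avgdl));
//   }
//
// For a single hit (f_q = 1), a document at the corpus-average length
// (dl == avgdl) yields 2.2 / 2.2 = 1.0, while one at half the average yields
// 2.2 / (1 + 1.2 * (1 - 0.7 + 0.7 * 0.5)) = 2.2 / 1.78 ~= 1.24, i.e. the
// shorter document is boosted.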
ScoredDocumentHit expected_scored_doc_hit1(document_id1, section_id_mask, - /*score=*/0.187114); + /*score=*/0.255482); ScoredDocumentHit expected_scored_doc_hit2(document_id2, section_id_mask, - /*score=*/0.084904); + /*score=*/0.115927); ScoredDocumentHit expected_scored_doc_hit3(document_id3, section_id_mask, - /*score=*/0.121896); + /*score=*/0.166435); EXPECT_THAT( scoring_processor->Score(std::move(doc_hit_info_iterator), /*num_to_score=*/3, &query_term_iterators), @@ -422,7 +375,7 @@ TEST_F(ScoringProcessorTest, // Creates a ScoringProcessor ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<ScoringProcessor> scoring_processor, - ScoringProcessor::Create(spec_proto, document_store(), schema_store())); + ScoringProcessor::Create(spec_proto, document_store())); std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>> query_term_iterators; @@ -431,11 +384,11 @@ TEST_F(ScoringProcessorTest, // Since the three documents all contain the query term "foo" exactly once // and they have the same length, they will have the same BM25F score. ScoredDocumentHit expected_scored_doc_hit1(document_id1, section_id_mask, - /*score=*/0.118455); + /*score=*/0.16173716); ScoredDocumentHit expected_scored_doc_hit2(document_id2, section_id_mask, - /*score=*/0.118455); + /*score=*/0.16173716); ScoredDocumentHit expected_scored_doc_hit3(document_id3, section_id_mask, - /*score=*/0.118455); + /*score=*/0.16173716); EXPECT_THAT( scoring_processor->Score(std::move(doc_hit_info_iterator), /*num_to_score=*/3, &query_term_iterators), @@ -495,7 +448,7 @@ TEST_F(ScoringProcessorTest, // Creates a ScoringProcessor ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<ScoringProcessor> scoring_processor, - ScoringProcessor::Create(spec_proto, document_store(), schema_store())); + ScoringProcessor::Create(spec_proto, document_store())); std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>> query_term_iterators; @@ -504,11 +457,11 @@ TEST_F(ScoringProcessorTest, // Since the three documents all have the same length, the score is decided by // the frequency of the query term "foo".
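// Plugging fixed, equal lengths (norm = 1.0) into the sketch above, the score
// is monotonic but saturating in term frequency: tf / (tf + k1) is roughly
// 0.45, 0.63, and 0.71 at tf = 1, 2, 3 for k1 = 1.2 (illustrative arithmetic,
// not the engine's exact output), so the document with the most "foo" hits
// scores highest in the expectations that follow.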
ScoredDocumentHit expected_scored_doc_hit1(document_id1, section_id_mask1, - /*score=*/0.226674); + /*score=*/0.309497); ScoredDocumentHit expected_scored_doc_hit2(document_id2, section_id_mask2, - /*score=*/0.118455); + /*score=*/0.16173716); ScoredDocumentHit expected_scored_doc_hit3(document_id3, section_id_mask3, - /*score=*/0.196720); + /*score=*/0.268599); EXPECT_THAT( scoring_processor->Score(std::move(doc_hit_info_iterator), /*num_to_score=*/3, &query_term_iterators), @@ -517,351 +470,6 @@ TEST_F(ScoringProcessorTest, EqualsScoredDocumentHit(expected_scored_doc_hit3))); } -TEST_F(ScoringProcessorTest, - ShouldScoreByRelevanceScore_HitTermWithZeroFrequency) { - DocumentProto document1 = - CreateDocument("icing", "email/1", kDefaultScore, - /*creation_timestamp_ms=*/kDefaultCreationTimestampMs); - - ICING_ASSERT_OK_AND_ASSIGN( - DocumentId document_id1, - document_store()->Put(document1, /*num_tokens=*/10)); - - // Document 1 contains the term "foo" 0 times in the "subject" property - DocHitInfo doc_hit_info1(document_id1); - doc_hit_info1.UpdateSection(/*section_id*/ 0, /*hit_term_frequency=*/0); - - // Creates input doc_hit_infos and expected output scored_document_hits - std::vector<DocHitInfo> doc_hit_infos = {doc_hit_info1}; - - // Creates a dummy DocHitInfoIterator with 1 result for the query "foo" - std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator = - std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo"); - - ScoringSpecProto spec_proto; - spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE); - - // Creates a ScoringProcessor - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<ScoringProcessor> scoring_processor, - ScoringProcessor::Create(spec_proto, document_store(), schema_store())); - - std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>> - query_term_iterators; - query_term_iterators["foo"] = - std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo"); - - SectionIdMask section_id_mask1 = 0b00000001; - - // Since the document hit has zero frequency, expect a score of zero. 
- ScoredDocumentHit expected_scored_doc_hit1(document_id1, section_id_mask1, - /*score=*/0.000000); - EXPECT_THAT( - scoring_processor->Score(std::move(doc_hit_info_iterator), - /*num_to_score=*/1, &query_term_iterators), - ElementsAre(EqualsScoredDocumentHit(expected_scored_doc_hit1))); -} - -TEST_F(ScoringProcessorTest, - ShouldScoreByRelevanceScore_SameHitFrequencyDifferentPropertyWeights) { - DocumentProto document1 = - CreateDocument("icing", "email/1", kDefaultScore, - /*creation_timestamp_ms=*/kDefaultCreationTimestampMs); - DocumentProto document2 = - CreateDocument("icing", "email/2", kDefaultScore, - /*creation_timestamp_ms=*/kDefaultCreationTimestampMs); - - ICING_ASSERT_OK_AND_ASSIGN( - DocumentId document_id1, - document_store()->Put(document1, /*num_tokens=*/1)); - ICING_ASSERT_OK_AND_ASSIGN( - DocumentId document_id2, - document_store()->Put(document2, /*num_tokens=*/1)); - - // Document 1 contains the term "foo" 1 time in the "body" property - SectionId body_section_id = 0; - DocHitInfo doc_hit_info1(document_id1); - doc_hit_info1.UpdateSection(body_section_id, /*hit_term_frequency=*/1); - - // Document 2 contains the term "foo" 1 time in the "subject" property - SectionId subject_section_id = 1; - DocHitInfo doc_hit_info2(document_id2); - doc_hit_info2.UpdateSection(subject_section_id, /*hit_term_frequency=*/1); - - // Creates input doc_hit_infos and expected output scored_document_hits - std::vector<DocHitInfo> doc_hit_infos = {doc_hit_info1, doc_hit_info2}; - - // Creates a dummy DocHitInfoIterator with 2 results for the query "foo" - std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator = - std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo"); - - ScoringSpecProto spec_proto; - spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE); - - PropertyWeight body_property_weight = - CreatePropertyWeight(/*path=*/"body", /*weight=*/0.5); - PropertyWeight subject_property_weight = - CreatePropertyWeight(/*path=*/"subject", /*weight=*/2.0); - *spec_proto.add_type_property_weights() = CreateTypePropertyWeights( - /*schema_type=*/"email", {body_property_weight, subject_property_weight}); - - // Creates a ScoringProcessor - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<ScoringProcessor> scoring_processor, - ScoringProcessor::Create(spec_proto, document_store(), schema_store())); - - std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>> - query_term_iterators; - query_term_iterators["foo"] = - std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo"); - - SectionIdMask body_section_id_mask = 1U << body_section_id; - SectionIdMask subject_section_id_mask = 1U << subject_section_id; - - // We expect document 2 to have a higher score than document 1 as it matches - // "foo" in the "subject" property, which is weighed higher than the "body" - // property. Final scores are computed with smoothing applied. 
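// A minimal sketch of how per-section weights are assumed to fold into term
// frequency before BM25F scoring (the real accumulation lives in the bm25f
// scorer and the section-weights code deleted later in this change; the
// helper below is illustrative, not Icing API):
//
//   double WeightedTermFrequency(
//       const std::unordered_map<SectionId, double>& tf_by_section,
//       const SectionWeights& weights, SchemaTypeId schema_type_id) {
//     double weighted_tf = 0.0;
//     for (const auto& [section_id, tf] : tf_by_section) {
//       // Each section's hits count in proportion to its normalized weight.
//       weighted_tf +=
//           tf * weights.GetNormalizedSectionWeight(schema_type_id, section_id);
//     }
//     return weighted_tf;
//   }
//
// With raw weights body=0.5 and subject=2.0 (normalized to 0.25 and 1.0), a
// single "subject" hit outweighs a single "body" hit, as asserted below.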
- ScoredDocumentHit expected_scored_doc_hit1(document_id1, body_section_id_mask, - /*score=*/0.053624); - ScoredDocumentHit expected_scored_doc_hit2(document_id2, - subject_section_id_mask, - /*score=*/0.153094); - EXPECT_THAT( - scoring_processor->Score(std::move(doc_hit_info_iterator), - /*num_to_score=*/2, &query_term_iterators), - ElementsAre(EqualsScoredDocumentHit(expected_scored_doc_hit1), - EqualsScoredDocumentHit(expected_scored_doc_hit2))); -} - -TEST_F(ScoringProcessorTest, - ShouldScoreByRelevanceScore_WithImplicitPropertyWeight) { - DocumentProto document1 = - CreateDocument("icing", "email/1", kDefaultScore, - /*creation_timestamp_ms=*/kDefaultCreationTimestampMs); - DocumentProto document2 = - CreateDocument("icing", "email/2", kDefaultScore, - /*creation_timestamp_ms=*/kDefaultCreationTimestampMs); - - ICING_ASSERT_OK_AND_ASSIGN( - DocumentId document_id1, - document_store()->Put(document1, /*num_tokens=*/1)); - ICING_ASSERT_OK_AND_ASSIGN( - DocumentId document_id2, - document_store()->Put(document2, /*num_tokens=*/1)); - - // Document 1 contains the term "foo" 1 time in the "body" property - SectionId body_section_id = 0; - DocHitInfo doc_hit_info1(document_id1); - doc_hit_info1.UpdateSection(body_section_id, /*hit_term_frequency=*/1); - - // Document 2 contains the term "foo" 1 time in the "subject" property - SectionId subject_section_id = 1; - DocHitInfo doc_hit_info2(document_id2); - doc_hit_info2.UpdateSection(subject_section_id, /*hit_term_frequency=*/1); - - // Creates input doc_hit_infos and expected output scored_document_hits - std::vector<DocHitInfo> doc_hit_infos = {doc_hit_info1, doc_hit_info2}; - - // Creates a dummy DocHitInfoIterator with 2 results for the query "foo" - std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator = - std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo"); - - ScoringSpecProto spec_proto; - spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE); - - PropertyWeight body_property_weight = - CreatePropertyWeight(/*path=*/"body", /*weight=*/0.5); - *spec_proto.add_type_property_weights() = CreateTypePropertyWeights( - /*schema_type=*/"email", {body_property_weight}); - - // Creates a ScoringProcessor - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<ScoringProcessor> scoring_processor, - ScoringProcessor::Create(spec_proto, document_store(), schema_store())); - - std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>> - query_term_iterators; - query_term_iterators["foo"] = - std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo"); - - SectionIdMask body_section_id_mask = 1U << body_section_id; - SectionIdMask subject_section_id_mask = 1U << subject_section_id; - - // We expect document 2 to have a higher score than document 1 as it matches - // "foo" in the "subject" property, which is weighed higher than the "body" - // property. This is because the "subject" property is implicitly given a - weight of 1.0, the default weight value. Final scores are computed with - smoothing applied.
- ScoredDocumentHit expected_scored_doc_hit1(document_id1, body_section_id_mask, - /*score=*/0.094601); - ScoredDocumentHit expected_scored_doc_hit2(document_id2, - subject_section_id_mask, - /*score=*/0.153094); - EXPECT_THAT( - scoring_processor->Score(std::move(doc_hit_info_iterator), - /*num_to_score=*/2, &query_term_iterators), - ElementsAre(EqualsScoredDocumentHit(expected_scored_doc_hit1), - EqualsScoredDocumentHit(expected_scored_doc_hit2))); -} - -TEST_F(ScoringProcessorTest, - ShouldScoreByRelevanceScore_WithDefaultPropertyWeight) { - DocumentProto document1 = - CreateDocument("icing", "email/1", kDefaultScore, - /*creation_timestamp_ms=*/kDefaultCreationTimestampMs); - DocumentProto document2 = - CreateDocument("icing", "email/2", kDefaultScore, - /*creation_timestamp_ms=*/kDefaultCreationTimestampMs); - - ICING_ASSERT_OK_AND_ASSIGN( - DocumentId document_id1, - document_store()->Put(document1, /*num_tokens=*/1)); - - // Document 1 contains the term "foo" 1 time in the "body" property - SectionId body_section_id = 0; - DocHitInfo doc_hit_info1(document_id1); - doc_hit_info1.UpdateSection(body_section_id, /*hit_term_frequency=*/1); - - // Creates input doc_hit_infos and expected output scored_document_hits - std::vector<DocHitInfo> doc_hit_infos = {doc_hit_info1}; - - // Creates a dummy DocHitInfoIterator with 1 result for the query "foo" - std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator = - std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo"); - - ScoringSpecProto spec_proto; - spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE); - - *spec_proto.add_type_property_weights() = - CreateTypePropertyWeights(/*schema_type=*/"email", {}); - - // Creates a ScoringProcessor with no explicit weights set. - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<ScoringProcessor> scoring_processor, - ScoringProcessor::Create(spec_proto, document_store(), schema_store())); - - ScoringSpecProto spec_proto_with_weights; - spec_proto_with_weights.set_rank_by( - ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE); - - PropertyWeight body_property_weight = CreatePropertyWeight(/*path=*/"body", - /*weight=*/1.0); - *spec_proto_with_weights.add_type_property_weights() = - CreateTypePropertyWeights(/*schema_type=*/"email", - {body_property_weight}); - - // Creates a ScoringProcessor with default weight set for "body" property. - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<ScoringProcessor> scoring_processor_with_weights, - ScoringProcessor::Create(spec_proto_with_weights, document_store(), - schema_store())); - - std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>> - query_term_iterators; - query_term_iterators["foo"] = - std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo"); - - // Create a doc hit iterator - std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>> - query_term_iterators_scoring_with_weights; - query_term_iterators_scoring_with_weights["foo"] = - std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo"); - - SectionIdMask body_section_id_mask = 1U << body_section_id; - - // We expect document 1 to have the same score whether a weight is explicitly - set to 1.0 or implicitly scored with the default weight. Final scores are - computed with smoothing applied.
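// Why the two processors must agree: a property missing from the spec gets the
// raw default weight kDefaultSectionWeight (1.0), and raw weights are
// normalized by the per-type maximum, so pinning "body" to 1.0 explicitly is a
// no-op. Worked values (assuming the schema's "subject" and "body" sections):
//
//   implicit spec:  raw {subject: 1.0, body: 1.0} -> normalized {1.0, 1.0}
//   explicit spec:  raw {subject: 1.0, body: 1.0} -> normalized {1.0, 1.0}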
- ScoredDocumentHit expected_scored_doc_hit(document_id1, body_section_id_mask, - /*score=*/0.208191); - EXPECT_THAT( - scoring_processor->Score(std::move(doc_hit_info_iterator), - /*num_to_score=*/1, &query_term_iterators), - ElementsAre(EqualsScoredDocumentHit(expected_scored_doc_hit))); - - // Restore ownership of doc hit iterator and query term iterator to test. - doc_hit_info_iterator = - std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo"); - query_term_iterators["foo"] = - std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo"); - - EXPECT_THAT(scoring_processor_with_weights->Score( - std::move(doc_hit_info_iterator), - /*num_to_score=*/1, &query_term_iterators), - ElementsAre(EqualsScoredDocumentHit(expected_scored_doc_hit))); -} - -TEST_F(ScoringProcessorTest, - ShouldScoreByRelevanceScore_WithZeroPropertyWeight) { - DocumentProto document1 = - CreateDocument("icing", "email/1", kDefaultScore, - /*creation_timestamp_ms=*/kDefaultCreationTimestampMs); - DocumentProto document2 = - CreateDocument("icing", "email/2", kDefaultScore, - /*creation_timestamp_ms=*/kDefaultCreationTimestampMs); - - ICING_ASSERT_OK_AND_ASSIGN( - DocumentId document_id1, - document_store()->Put(document1, /*num_tokens=*/1)); - ICING_ASSERT_OK_AND_ASSIGN( - DocumentId document_id2, - document_store()->Put(document2, /*num_tokens=*/1)); - - // Document 1 contains the term "foo" 1 time in the "body" property - SectionId body_section_id = 0; - DocHitInfo doc_hit_info1(document_id1); - doc_hit_info1.UpdateSection(body_section_id, /*hit_term_frequency=*/1); - - // Document 2 contains the term "foo" 1 time in the "subject" property - SectionId subject_section_id = 1; - DocHitInfo doc_hit_info2(document_id2); - doc_hit_info2.UpdateSection(subject_section_id, /*hit_term_frequency=*/1); - - // Creates input doc_hit_infos and expected output scored_document_hits - std::vector<DocHitInfo> doc_hit_infos = {doc_hit_info1, doc_hit_info2}; - - // Creates a dummy DocHitInfoIterator with 2 results for the query "foo" - std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator = - std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo"); - - ScoringSpecProto spec_proto; - spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE); - - // Sets property weight for "body" to 0.0. - PropertyWeight body_property_weight = - CreatePropertyWeight(/*path=*/"body", /*weight=*/0.0); - // Sets property weight for "subject" to 1.0. - PropertyWeight subject_property_weight = - CreatePropertyWeight(/*path=*/"subject", /*weight=*/1.0); - *spec_proto.add_type_property_weights() = CreateTypePropertyWeights( - /*schema_type=*/"email", {body_property_weight, subject_property_weight}); - - // Creates a ScoringProcessor - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<ScoringProcessor> scoring_processor, - ScoringProcessor::Create(spec_proto, document_store(), schema_store())); - - std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>> - query_term_iterators; - query_term_iterators["foo"] = - std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo"); - - std::vector<ScoredDocumentHit> scored_document_hits = - scoring_processor->Score(std::move(doc_hit_info_iterator), - /*num_to_score=*/2, &query_term_iterators); - - // We expect document1 to have a score of 0.0 as the query term "foo" matches - // in the "body" property which has a weight of 0.0. This is a result of the - // weighted term frequency being scaled down to 0.0 for the hit. 
We expect - // document2 to have a positive score as the query term "foo" matches in the - // "subject" property which has a weight of 1.0. - EXPECT_THAT(scored_document_hits, SizeIs(2)); - EXPECT_THAT(scored_document_hits.at(0).document_id(), Eq(document_id1)); - EXPECT_THAT(scored_document_hits.at(0).score(), Eq(0.0)); - EXPECT_THAT(scored_document_hits.at(1).document_id(), Eq(document_id2)); - EXPECT_THAT(scored_document_hits.at(1).score(), Gt(0.0)); -} - TEST_F(ScoringProcessorTest, ShouldScoreByCreationTimestamp) { DocumentProto document1 = CreateDocument("icing", "email/1", kDefaultScore, @@ -901,7 +509,7 @@ TEST_F(ScoringProcessorTest, ShouldScoreByCreationTimestamp) { // Creates a ScoringProcessor which ranks in descending order ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<ScoringProcessor> scoring_processor, - ScoringProcessor::Create(spec_proto, document_store(), schema_store())); + ScoringProcessor::Create(spec_proto, document_store())); EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator), /*num_to_score=*/3), @@ -961,7 +569,7 @@ TEST_F(ScoringProcessorTest, ShouldScoreByUsageCount) { // Creates a ScoringProcessor which ranks in descending order ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<ScoringProcessor> scoring_processor, - ScoringProcessor::Create(spec_proto, document_store(), schema_store())); + ScoringProcessor::Create(spec_proto, document_store())); EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator), /*num_to_score=*/3), @@ -1021,7 +629,7 @@ TEST_F(ScoringProcessorTest, ShouldScoreByUsageTimestamp) { // Creates a ScoringProcessor which ranks in descending order ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<ScoringProcessor> scoring_processor, - ScoringProcessor::Create(spec_proto, document_store(), schema_store())); + ScoringProcessor::Create(spec_proto, document_store())); EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator), /*num_to_score=*/3), @@ -1057,7 +665,7 @@ TEST_F(ScoringProcessorTest, ShouldHandleNoScores) { // Creates a ScoringProcessor which ranks in descending order ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<ScoringProcessor> scoring_processor, - ScoringProcessor::Create(spec_proto, document_store(), schema_store())); + ScoringProcessor::Create(spec_proto, document_store())); EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator), /*num_to_score=*/4), ElementsAre(EqualsScoredDocumentHit(scored_document_hit_default), @@ -1106,7 +714,7 @@ TEST_F(ScoringProcessorTest, ShouldWrapResultsWhenNoScoring) { // Creates a ScoringProcessor which ranks in descending order ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<ScoringProcessor> scoring_processor, - ScoringProcessor::Create(spec_proto, document_store(), schema_store())); + ScoringProcessor::Create(spec_proto, document_store())); EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator), /*num_to_score=*/3), diff --git a/icing/scoring/section-weights.cc b/icing/scoring/section-weights.cc deleted file mode 100644 index ed7cd5e..0000000 --- a/icing/scoring/section-weights.cc +++ /dev/null @@ -1,151 +0,0 @@ -// Copyright (C) 2021 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "icing/scoring/section-weights.h" - -#include <cfloat> -#include <unordered_map> -#include <utility> - -#include "icing/proto/scoring.pb.h" -#include "icing/schema/section.h" -#include "icing/util/logging.h" - -namespace icing { -namespace lib { - -namespace { - -// Normalizes all weights in the map to be in range [0.0, 1.0], where the max -// weight is normalized to 1.0. In the case that all weights are equal to 0.0, -// the normalized weight for each will be 0.0. -inline void NormalizeSectionWeights( - double max_weight, std::unordered_map<SectionId, double>& section_weights) { - if (max_weight == 0.0) { - return; - } - for (auto& raw_weight : section_weights) { - raw_weight.second = raw_weight.second / max_weight; - } -} -} // namespace - -libtextclassifier3::StatusOr<std::unique_ptr<SectionWeights>> -SectionWeights::Create(const SchemaStore* schema_store, - const ScoringSpecProto& scoring_spec) { - ICING_RETURN_ERROR_IF_NULL(schema_store); - - std::unordered_map<SchemaTypeId, NormalizedSectionWeights> - schema_property_weight_map; - for (const TypePropertyWeights& type_property_weights : - scoring_spec.type_property_weights()) { - std::string_view schema_type = type_property_weights.schema_type(); - auto schema_type_id_or = schema_store->GetSchemaTypeId(schema_type); - if (!schema_type_id_or.ok()) { - ICING_LOG(WARNING) << "No schema type id found for schema type: " - << schema_type; - continue; - } - SchemaTypeId schema_type_id = schema_type_id_or.ValueOrDie(); - auto section_metadata_list_or = - schema_store->GetSectionMetadata(schema_type.data()); - if (!section_metadata_list_or.ok()) { - ICING_LOG(WARNING) << "No metadata found for schema type: " - << schema_type; - continue; - } - - const std::vector<SectionMetadata>* metadata_list = - section_metadata_list_or.ValueOrDie(); - - std::unordered_map<std::string, double> property_paths_weights; - for (const PropertyWeight& property_weight : - type_property_weights.property_weights()) { - double property_path_weight = property_weight.weight(); - - // Return error on negative weights. - if (property_path_weight < 0.0) { - return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( - "Property weight for property path \"%s\" is negative. Negative " - "weights are invalid.", - property_weight.path().c_str())); - } - property_paths_weights.insert( - {property_weight.path(), property_path_weight}); - } - NormalizedSectionWeights normalized_section_weights = - ExtractNormalizedSectionWeights(property_paths_weights, *metadata_list); - - schema_property_weight_map.insert( - {schema_type_id, - {/*section_weights*/ std::move( - normalized_section_weights.section_weights), - /*default_weight*/ normalized_section_weights.default_weight}}); - } - // Using `new` to access a non-public constructor. 
- return std::unique_ptr<SectionWeights>( - new SectionWeights(std::move(schema_property_weight_map))); -} - -double SectionWeights::GetNormalizedSectionWeight(SchemaTypeId schema_type_id, - SectionId section_id) const { - auto schema_type_map = schema_section_weight_map_.find(schema_type_id); - if (schema_type_map == schema_section_weight_map_.end()) { - // Return default weight if the schema type has no weights specified. - return kDefaultSectionWeight; - } - - auto section_weight = - schema_type_map->second.section_weights.find(section_id); - if (section_weight == schema_type_map->second.section_weights.end()) { - // If there is no entry for SectionId, the weight is implicitly the - // normalized default weight. - return schema_type_map->second.default_weight; - } - return section_weight->second; -} - -inline SectionWeights::NormalizedSectionWeights -SectionWeights::ExtractNormalizedSectionWeights( - const std::unordered_map<std::string, double>& raw_weights, - const std::vector<SectionMetadata>& metadata_list) { - double max_weight = -std::numeric_limits<double>::infinity(); - std::unordered_map<SectionId, double> section_weights; - for (const SectionMetadata& section_metadata : metadata_list) { - std::string_view metadata_path = section_metadata.path; - double section_weight = kDefaultSectionWeight; - auto iter = raw_weights.find(metadata_path.data()); - if (iter != raw_weights.end()) { - section_weight = iter->second; - section_weights.insert({section_metadata.id, section_weight}); - } - // Replace max if we see new max weight. - max_weight = std::max(max_weight, section_weight); - } - - NormalizeSectionWeights(max_weight, section_weights); - // Set normalized default weight to 1.0 in case there is no section - // metadata and max_weight is -INF (we should not see this case). - double normalized_default_weight = - max_weight == -std::numeric_limits<double>::infinity() - ? kDefaultSectionWeight - : kDefaultSectionWeight / max_weight; - SectionWeights::NormalizedSectionWeights normalized_section_weights = - SectionWeights::NormalizedSectionWeights(); - normalized_section_weights.section_weights = std::move(section_weights); - normalized_section_weights.default_weight = normalized_default_weight; - return normalized_section_weights; -} -} // namespace lib -} // namespace icing diff --git a/icing/scoring/section-weights.h b/icing/scoring/section-weights.h deleted file mode 100644 index 23a9188..0000000 --- a/icing/scoring/section-weights.h +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright (C) 2021 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef ICING_SCORING_SECTION_WEIGHTS_H_ -#define ICING_SCORING_SECTION_WEIGHTS_H_ - -#include <unordered_map> - -#include "icing/text_classifier/lib3/utils/base/statusor.h" -#include "icing/schema/schema-store.h" -#include "icing/store/document-store.h" - -namespace icing { -namespace lib { - -inline constexpr double kDefaultSectionWeight = 1.0; - -// Provides functions for setting and retrieving section weights for schema -// type properties. Section weights are used to promote and demote term matches -// in sections when scoring results. Section weights are provided by property -// path, and can range from [0, DBL_MAX]. The SectionId is matched to the -// property path by going over the schema type's section metadata. Weights that -// correspond to a valid property path are then normalized against the maximum -// section weight, and put into a map for quick access by scorers. By default, -// a section is given a raw, pre-normalized weight of 1.0. -class SectionWeights { - public: - // SectionWeights instances should not be copied. - SectionWeights(const SectionWeights&) = delete; - SectionWeights& operator=(const SectionWeights&) = delete; - - // Factory function to create a SectionWeights instance. Raw weights are - // provided through the ScoringSpecProto. Provided property paths for weights - // are validated against the schema type's section metadata. If the property - // path doesn't exist, the property weight is ignored. If a weight is - // negative, an invalid argument error is returned. Raw weights are then - // normalized against the maximum weight for that schema type. - // - // Returns: - // A SectionWeights instance on success - // FAILED_PRECONDITION on any null pointer input - // INVALID_ARGUMENT if a provided weight for a property path is negative. - static libtextclassifier3::StatusOr<std::unique_ptr<SectionWeights>> Create( - const SchemaStore* schema_store, const ScoringSpecProto& scoring_spec); - - // Returns the normalized section weight by SchemaTypeId and SectionId. If - // the SchemaTypeId, or the SectionId for a SchemaTypeId, is not found in the - // normalized weights map, the default weight is returned instead. - double GetNormalizedSectionWeight(SchemaTypeId schema_type_id, - SectionId section_id) const; - - private: - // Holds the normalized section weights for a schema type, as well as the - // normalized default weight for sections that have no weight set. - struct NormalizedSectionWeights { - std::unordered_map<SectionId, double> section_weights; - double default_weight; - }; - - explicit SectionWeights( - const std::unordered_map<SchemaTypeId, NormalizedSectionWeights> - schema_section_weight_map) - : schema_section_weight_map_(std::move(schema_section_weight_map)) {} - - // Creates a map of section ids to normalized weights from the raw property - // path weight map and section metadata and calculates the normalized default - // section weight. - static inline SectionWeights::NormalizedSectionWeights - ExtractNormalizedSectionWeights( - const std::unordered_map<std::string, double>& raw_weights, - const std::vector<SectionMetadata>& metadata_list); - - // A map of (SchemaTypeId -> SectionId -> Normalized Weight) that allows for fast - // look up of normalized weights. This is precomputed when creating a - // SectionWeights instance.
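// Worked example of the normalization (values mirror the
// ShouldNormalizeIfAllWeightsBelowOne test removed below): raw weights
// {body: 0.1, sender.name: 0.2, subject: 0.4} have a per-type maximum of 0.4,
// so the precomputed entries become {body: 0.25, sender.name: 0.5,
// subject: 1.0}, and a lookup is then just two hash-map finds.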
- std::unordered_map<SchemaTypeId, NormalizedSectionWeights> - schema_section_weight_map_; -}; - -} // namespace lib -} // namespace icing - -#endif // ICING_SCORING_SECTION_WEIGHTS_H_ diff --git a/icing/scoring/section-weights_test.cc b/icing/scoring/section-weights_test.cc deleted file mode 100644 index 330faee..0000000 --- a/icing/scoring/section-weights_test.cc +++ /dev/null @@ -1,443 +0,0 @@ -// Copyright (C) 2021 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "icing/scoring/section-weights.h" - -#include <cfloat> - -#include "gmock/gmock.h" -#include "gtest/gtest.h" -#include "icing/proto/scoring.pb.h" -#include "icing/schema-builder.h" -#include "icing/testing/common-matchers.h" -#include "icing/testing/fake-clock.h" -#include "icing/testing/tmp-directory.h" - -namespace icing { -namespace lib { - -namespace { -using ::testing::Eq; - -class SectionWeightsTest : public testing::Test { - protected: - SectionWeightsTest() - : test_dir_(GetTestTempDir() + "/icing"), - schema_store_dir_(test_dir_ + "/schema_store") {} - - void SetUp() override { - // Creates file directories - filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()); - filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str()); - - ICING_ASSERT_OK_AND_ASSIGN( - schema_store_, - SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); - - SchemaTypeConfigProto sender_schema = - SchemaTypeConfigBuilder() - .SetType("sender") - .AddProperty( - PropertyConfigBuilder() - .SetName("name") - .SetDataTypeString( - TermMatchType::PREFIX, - StringIndexingConfig::TokenizerType::PLAIN) - .SetCardinality(PropertyConfigProto::Cardinality::OPTIONAL)) - .Build(); - SchemaTypeConfigProto email_schema = - SchemaTypeConfigBuilder() - .SetType("email") - .AddProperty( - PropertyConfigBuilder() - .SetName("subject") - .SetDataTypeString( - TermMatchType::PREFIX, - StringIndexingConfig::TokenizerType::PLAIN) - .SetDataType(PropertyConfigProto::DataType::STRING) - .SetCardinality(PropertyConfigProto::Cardinality::OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("body") - .SetDataTypeString( - TermMatchType::PREFIX, - StringIndexingConfig::TokenizerType::PLAIN) - .SetDataType(PropertyConfigProto::DataType::STRING) - .SetCardinality(PropertyConfigProto::Cardinality::OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("sender") - .SetDataTypeDocument("sender", - /*index_nested_properties=*/true) - .SetCardinality(PropertyConfigProto::Cardinality::OPTIONAL)) - .Build(); - SchemaProto schema = - SchemaBuilder().AddType(sender_schema).AddType(email_schema).Build(); - - ICING_ASSERT_OK(schema_store_->SetSchema(schema)); - } - - void TearDown() override { - schema_store_.reset(); - filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()); - } - - SchemaStore *schema_store() { return schema_store_.get(); } - - private: - const std::string test_dir_; - const std::string schema_store_dir_; - Filesystem filesystem_; - FakeClock fake_clock_; - std::unique_ptr<SchemaStore> 
schema_store_; -}; - -TEST_F(SectionWeightsTest, ShouldNormalizeSinglePropertyWeight) { - ScoringSpecProto spec_proto; - - TypePropertyWeights *type_property_weights = - spec_proto.add_type_property_weights(); - type_property_weights->set_schema_type("sender"); - - PropertyWeight *property_weight = - type_property_weights->add_property_weights(); - property_weight->set_weight(5.0); - property_weight->set_path("name"); - - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<SectionWeights> section_weights, - SectionWeights::Create(schema_store(), spec_proto)); - ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId sender_schema_type_id, - schema_store()->GetSchemaTypeId("sender")); - - // section_id 0 corresponds to property "name". - // We expect 1.0 as there is only one property in the "sender" schema type - // so it should take the max normalized weight of 1.0. - EXPECT_THAT(section_weights->GetNormalizedSectionWeight(sender_schema_type_id, - /*section_id=*/0), - Eq(1.0)); -} - -TEST_F(SectionWeightsTest, ShouldAcceptMaxWeightValue) { - ScoringSpecProto spec_proto; - - TypePropertyWeights *type_property_weights = - spec_proto.add_type_property_weights(); - type_property_weights->set_schema_type("sender"); - - PropertyWeight *property_weight = - type_property_weights->add_property_weights(); - property_weight->set_weight(DBL_MAX); - property_weight->set_path("name"); - - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<SectionWeights> section_weights, - SectionWeights::Create(schema_store(), spec_proto)); - ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId sender_schema_type_id, - schema_store()->GetSchemaTypeId("sender")); - - // section_id 0 corresponds to property "name". - EXPECT_THAT(section_weights->GetNormalizedSectionWeight(sender_schema_type_id, - /*section_id=*/0), - Eq(1.0)); -} - -TEST_F(SectionWeightsTest, ShouldFailWithNegativeWeights) { - ScoringSpecProto spec_proto; - - TypePropertyWeights *type_property_weights = - spec_proto.add_type_property_weights(); - type_property_weights->set_schema_type("email"); - - PropertyWeight *body_property_weight = - type_property_weights->add_property_weights(); - body_property_weight->set_weight(-100.0); - body_property_weight->set_path("body"); - - EXPECT_THAT(SectionWeights::Create(schema_store(), spec_proto).status(), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); -} - -TEST_F(SectionWeightsTest, ShouldAcceptZeroWeight) { - ScoringSpecProto spec_proto; - - TypePropertyWeights *type_property_weights = - spec_proto.add_type_property_weights(); - type_property_weights->set_schema_type("email"); - - PropertyWeight *body_property_weight = - type_property_weights->add_property_weights(); - body_property_weight->set_weight(2.0); - body_property_weight->set_path("body"); - - PropertyWeight *subject_property_weight = - type_property_weights->add_property_weights(); - subject_property_weight->set_weight(0.0); - subject_property_weight->set_path("subject"); - - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<SectionWeights> section_weights, - SectionWeights::Create(schema_store(), spec_proto)); - ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id, - schema_store()->GetSchemaTypeId("email")); - - // Normalized weight for "body" property. - EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id, - /*section_id=*/0), - Eq(1.0)); - // Normalized weight for "subject" property.
- EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id, - /*section_id=*/2), - Eq(0.0)); -} - -TEST_F(SectionWeightsTest, ShouldNormalizeToZeroWhenAllWeightsZero) { - ScoringSpecProto spec_proto; - - TypePropertyWeights *type_property_weights = - spec_proto.add_type_property_weights(); - type_property_weights->set_schema_type("email"); - - PropertyWeight *body_property_weight = - type_property_weights->add_property_weights(); - body_property_weight->set_weight(0.0); - body_property_weight->set_path("body"); - - PropertyWeight *sender_property_weight = - type_property_weights->add_property_weights(); - sender_property_weight->set_weight(0.0); - sender_property_weight->set_path("sender.name"); - - PropertyWeight *subject_property_weight = - type_property_weights->add_property_weights(); - subject_property_weight->set_weight(0.0); - subject_property_weight->set_path("subject"); - - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<SectionWeights> section_weights, - SectionWeights::Create(schema_store(), spec_proto)); - ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id, - schema_store()->GetSchemaTypeId("email")); - - // Normalized weight for "body" property. - EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id, - /*section_id=*/0), - Eq(0.0)); - // Normalized weight for "sender.name" property (the nested property). - EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id, - /*section_id=*/1), - Eq(0.0)); - // Normalized weight for "subject" property. - EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id, - /*section_id=*/2), - Eq(0.0)); -} - -TEST_F(SectionWeightsTest, ShouldReturnDefaultIfTypePropertyWeightsNotSet) { - ScoringSpecProto spec_proto; - - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<SectionWeights> section_weights, - SectionWeights::Create(schema_store(), spec_proto)); - ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id, - schema_store()->GetSchemaTypeId("email")); - - EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id, - /*section_id=*/0), - Eq(kDefaultSectionWeight)); -} - -TEST_F(SectionWeightsTest, ShouldSetNestedPropertyWeights) { - ScoringSpecProto spec_proto; - - TypePropertyWeights *type_property_weights = - spec_proto.add_type_property_weights(); - type_property_weights->set_schema_type("email"); - - PropertyWeight *body_property_weight = - type_property_weights->add_property_weights(); - body_property_weight->set_weight(1.0); - body_property_weight->set_path("body"); - - PropertyWeight *subject_property_weight = - type_property_weights->add_property_weights(); - subject_property_weight->set_weight(100.0); - subject_property_weight->set_path("subject"); - - PropertyWeight *nested_property_weight = - type_property_weights->add_property_weights(); - nested_property_weight->set_weight(50.0); - nested_property_weight->set_path("sender.name"); - - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<SectionWeights> section_weights, - SectionWeights::Create(schema_store(), spec_proto)); - ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id, - schema_store()->GetSchemaTypeId("email")); - - // Normalized weight for "body" property. - EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id, - /*section_id=*/0), - Eq(0.01)); - // Normalized weight for "sender.name" property (the nested property). 
- EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id, - /*section_id=*/1), - Eq(0.5)); - // Normalized weight for "subject" property. - EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id, - /*section_id=*/2), - Eq(1.0)); -} - -TEST_F(SectionWeightsTest, ShouldNormalizeIfAllWeightsBelowOne) { - ScoringSpecProto spec_proto; - - TypePropertyWeights *type_property_weights = - spec_proto.add_type_property_weights(); - type_property_weights->set_schema_type("email"); - - PropertyWeight *body_property_weight = - type_property_weights->add_property_weights(); - body_property_weight->set_weight(0.1); - body_property_weight->set_path("body"); - - PropertyWeight *sender_name_weight = - type_property_weights->add_property_weights(); - sender_name_weight->set_weight(0.2); - sender_name_weight->set_path("sender.name"); - - PropertyWeight *subject_property_weight = - type_property_weights->add_property_weights(); - subject_property_weight->set_weight(0.4); - subject_property_weight->set_path("subject"); - - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<SectionWeights> section_weights, - SectionWeights::Create(schema_store(), spec_proto)); - ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id, - schema_store()->GetSchemaTypeId("email")); - - // Normalized weight for "body" property. - EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id, - /*section_id=*/0), - Eq(1.0 / 4.0)); - // Normalized weight for "sender.name" property (the nested property). - EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id, - /*section_id=*/1), - Eq(2.0 / 4.0)); - // Normalized weight for "subject" property. - EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id, - /*section_id=*/2), - Eq(1.0)); -} - -TEST_F(SectionWeightsTest, ShouldSetNestedPropertyWeightSeparatelyForTypes) { - ScoringSpecProto spec_proto; - - TypePropertyWeights *email_type_property_weights = - spec_proto.add_type_property_weights(); - email_type_property_weights->set_schema_type("email"); - - PropertyWeight *body_property_weight = - email_type_property_weights->add_property_weights(); - body_property_weight->set_weight(1.0); - body_property_weight->set_path("body"); - - PropertyWeight *subject_property_weight = - email_type_property_weights->add_property_weights(); - subject_property_weight->set_weight(100.0); - subject_property_weight->set_path("subject"); - - PropertyWeight *sender_name_property_weight = - email_type_property_weights->add_property_weights(); - sender_name_property_weight->set_weight(50.0); - sender_name_property_weight->set_path("sender.name"); - - TypePropertyWeights *sender_type_property_weights = - spec_proto.add_type_property_weights(); - sender_type_property_weights->set_schema_type("sender"); - - PropertyWeight *sender_property_weight = - sender_type_property_weights->add_property_weights(); - sender_property_weight->set_weight(25.0); - sender_property_weight->set_path("sender"); - - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<SectionWeights> section_weights, - SectionWeights::Create(schema_store(), spec_proto)); - ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id, - schema_store()->GetSchemaTypeId("email")); - ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId sender_schema_type_id, - schema_store()->GetSchemaTypeId("sender")); - - // Normalized weight for "sender.name" property (the nested property) - EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id, - 
/*section_id=*/1), - Eq(0.5)); - // Normalized weight for "name" property for "sender" schema type. As it is - // the only property of the type, it should take the max normalized weight of - // 1.0. - EXPECT_THAT(section_weights->GetNormalizedSectionWeight(sender_schema_type_id, - /*section_id=*/2), - Eq(1.0)); -} - -TEST_F(SectionWeightsTest, ShouldSkipNonExistentPathWhenSettingWeights) { - ScoringSpecProto spec_proto; - - TypePropertyWeights *type_property_weights = - spec_proto.add_type_property_weights(); - type_property_weights->set_schema_type("email"); - - // If this property weight isn't skipped, then the max property weight would - // be set to 100.0 and all weights would be normalized against the max. - PropertyWeight *non_valid_property_weight = - type_property_weights->add_property_weights(); - non_valid_property_weight->set_weight(100.0); - non_valid_property_weight->set_path("sender.organization"); - - PropertyWeight *subject_property_weight = - type_property_weights->add_property_weights(); - subject_property_weight->set_weight(10.0); - subject_property_weight->set_path("subject"); - - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<SectionWeights> section_weights, - SectionWeights::Create(schema_store(), spec_proto)); - ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id, - schema_store()->GetSchemaTypeId("email")); - - // Normalized weight for "body" property. Because the weight is not explicitly - // set, it is set to the default of 1.0 before being normalized. - EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id, - /*section_id=*/0), - Eq(0.1)); - // Normalized weight for "sender.name" property (the nested property). Because - // the weight is not explicitly set, it is set to the default of 1.0 before - // being normalized. - EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id, - /*section_id=*/1), - Eq(0.1)); - // Normalized weight for "subject" property. Because the invalid property path - // is skipped when assigning weights, subject takes the max normalized weight - // of 1.0 instead. - EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id, - /*section_id=*/2), - Eq(1.0)); -} - -} // namespace - -} // namespace lib -} // namespace icing diff --git a/icing/store/document-log-creator.cc b/icing/store/document-log-creator.cc index 5e23a8e..a035f93 100644 --- a/icing/store/document-log-creator.cc +++ b/icing/store/document-log-creator.cc @@ -69,24 +69,33 @@ DocumentLogCreator::Create(const Filesystem* filesystem, const std::string& base_dir) { bool v0_exists = filesystem->FileExists(MakeDocumentLogFilenameV0(base_dir).c_str()); + bool regen_derived_files = false; + +#ifdef ENABLE_V1_MIGRATION bool v1_exists = filesystem->FileExists(MakeDocumentLogFilenameV1(base_dir).c_str()); - bool new_file = false; - int preexisting_file_version = kCurrentVersion; if (v0_exists && !v1_exists) { ICING_RETURN_IF_ERROR(MigrateFromV0ToV1(filesystem, base_dir)); // Need to regenerate derived files since documents may be written to a // different file offset in the log. - preexisting_file_version = 0; + regen_derived_files = true; } else if (!v1_exists) { // First time initializing a v1 log. There are no existing derived files at // this point, so we should generate some. "regenerate" here also means // "generate for the first time", i.e. we shouldn't expect there to be any // existing derived files. 
- new_file = true; + regen_derived_files = true; + } +#else // !ENABLE_V1_MIGRATION + if (v0_exists) { + // If migration from v0 to v1 is not enabled, then simply delete the v0 file + // and treat this as if it's our first time initializing a v1 log. + regen_derived_files = true; + filesystem->DeleteFile(MakeDocumentLogFilenameV0(base_dir).c_str()); + } +#endif // ENABLE_V1_MIGRATION ICING_ASSIGN_OR_RETURN( PortableFileBackedProtoLog<DocumentWrapper>::CreateResult @@ -97,7 +106,7 @@ DocumentLogCreator::Create(const Filesystem* filesystem, /*compress_in=*/true))); CreateResult create_result = {std::move(log_create_result), - preexisting_file_version, new_file}; + regen_derived_files}; return create_result; } diff --git a/icing/store/document-log-creator.h b/icing/store/document-log-creator.h index be8feed..51cf497 100644 --- a/icing/store/document-log-creator.h +++ b/icing/store/document-log-creator.h @@ -30,20 +30,14 @@ namespace lib { // be necessary. class DocumentLogCreator { public: - // Version 0 refers to FileBackedProtoLog - // Version 1 refers to PortableFileBackedProtoLog with kFileFormatVersion = 0 - static constexpr int32_t kCurrentVersion = 1; struct CreateResult { // The create result passed up from the PortableFileBackedProtoLog::Create. // Contains the document log. PortableFileBackedProtoLog<DocumentWrapper>::CreateResult log_create_result; - // The version number of the pre-existing document log file. - // If there is no document log file, it will be set to kCurrentVersion. - int preexisting_file_version; - - // Whether the created file is new. - bool new_file; + // Whether the caller needs to also regenerate/generate any derived files + // based off of the initialized document log. + bool regen_derived_files; }; // Creates the document log in the base_dir.
Will create one if it doesn't diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc index 8c8369c..226a96b 100644 --- a/icing/store/document-store.cc +++ b/icing/store/document-store.cc @@ -164,32 +164,6 @@ int64_t CalculateExpirationTimestampMs(int64_t creation_timestamp_ms, return expiration_timestamp_ms; } -InitializeStatsProto::RecoveryCause GetRecoveryCause( - const DocumentLogCreator::CreateResult& create_result, - bool force_recovery_and_revalidate_documents) { - if (force_recovery_and_revalidate_documents) { - return InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC; - } else if (create_result.log_create_result.has_data_loss()) { - return InitializeStatsProto::DATA_LOSS; - } else if (create_result.preexisting_file_version != - DocumentLogCreator::kCurrentVersion) { - return InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT; - } - return InitializeStatsProto::NONE; -} - -InitializeStatsProto::DocumentStoreDataStatus GetDataStatus( - DataLoss data_loss) { - switch (data_loss) { - case DataLoss::PARTIAL: - return InitializeStatsProto::PARTIAL_LOSS; - case DataLoss::COMPLETE: - return InitializeStatsProto::COMPLETE_LOSS; - case DataLoss::NONE: - return InitializeStatsProto::NO_DATA_LOSS; - } -} - } // namespace DocumentStore::DocumentStore(const Filesystem* filesystem, @@ -262,34 +236,44 @@ libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize( std::move(create_result_or).ValueOrDie(); document_log_ = std::move(create_result.log_create_result.proto_log); - InitializeStatsProto::RecoveryCause recovery_cause = - GetRecoveryCause(create_result, force_recovery_and_revalidate_documents); - - if (recovery_cause != InitializeStatsProto::NONE || create_result.new_file) { - ICING_LOG(WARNING) << "Starting Document Store Recovery with cause=" - << recovery_cause << ", and create result { new_file=" - << create_result.new_file << ", preexisting_file_version=" - << create_result.preexisting_file_version << ", data_loss=" - << create_result.log_create_result.data_loss << "} and kCurrentVersion=" - << DocumentLogCreator::kCurrentVersion; + + if (create_result.regen_derived_files || + force_recovery_and_revalidate_documents || + create_result.log_create_result.has_data_loss()) { // We can't rely on any existing derived files. Recreate them from scratch. // Currently happens if: // 1) This is a new log and we don't have derived files yet // 2) Client wanted us to force a regeneration. // 3) Log has some data loss, can't rely on existing derived data. + if (create_result.log_create_result.has_data_loss() && + initialize_stats != nullptr) { + ICING_LOG(WARNING) + << "Data loss in document log, regenerating derived files."; + initialize_stats->set_document_store_recovery_cause( + InitializeStatsProto::DATA_LOSS); + + if (create_result.log_create_result.data_loss == DataLoss::PARTIAL) { + // Ground truth is partially lost. + initialize_stats->set_document_store_data_status( + InitializeStatsProto::PARTIAL_LOSS); + } else { + // Ground truth is completely lost.
+ initialize_stats->set_document_store_data_status( + InitializeStatsProto::COMPLETE_LOSS); + } + } + std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer(); libtextclassifier3::Status status = RegenerateDerivedFiles(force_recovery_and_revalidate_documents); if (initialize_stats != nullptr && - recovery_cause != InitializeStatsProto::NONE) { + (force_recovery_and_revalidate_documents || + create_result.log_create_result.has_data_loss())) { // Only consider it a recovery if the client forced a recovery or there // was data loss. Otherwise, this could just be the first time we're // initializing and generating derived files. initialize_stats->set_document_store_recovery_latency_ms( document_recovery_timer->GetElapsedMilliseconds()); - initialize_stats->set_document_store_recovery_cause(recovery_cause); - initialize_stats->set_document_store_data_status( - GetDataStatus(create_result.log_create_result.data_loss)); } if (!status.ok()) { ICING_LOG(ERROR) @@ -298,13 +282,13 @@ libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize( } } else { if (!InitializeExistingDerivedFiles().ok()) { - ICING_LOG(WARNING) + ICING_VLOG(1) << "Couldn't find derived files or failed to initialize them, " "regenerating derived files for DocumentStore."; std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer(); libtextclassifier3::Status status = RegenerateDerivedFiles( - /*force_recovery_and_revalidate_documents=*/false); - if (initialize_stats != nullptr) { + /*force_recovery_and_revalidate_documents*/ false); + if (initialize_stats != nullptr && num_documents() > 0) { initialize_stats->set_document_store_recovery_cause( InitializeStatsProto::IO_ERROR); initialize_stats->set_document_store_recovery_latency_ms( @@ -431,19 +415,7 @@ libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles( // Iterates through document log auto iterator = document_log_->GetIterator(); auto iterator_status = iterator.Advance(); - libtextclassifier3::StatusOr<int64_t> element_size = - document_log_->GetElementsFileSize(); - libtextclassifier3::StatusOr<int64_t> disk_usage = - document_log_->GetDiskUsage(); - if (element_size.ok() && disk_usage.ok()) { - ICING_VLOG(1) << "Starting recovery of document store. Document store " - "elements file size:" - << element_size.ValueOrDie() - << ", disk usage=" << disk_usage.ValueOrDie(); - } while (iterator_status.ok()) { - ICING_VLOG(2) << "Attempting to read document at offset=" - << iterator.GetOffset(); libtextclassifier3::StatusOr<DocumentWrapper> document_wrapper_or = document_log_->ReadProto(iterator.GetOffset()); @@ -558,7 +530,7 @@ libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles( libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() { // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset(). document_key_mapper_.reset(); - // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. libtextclassifier3::Status status = KeyMapper<DocumentId>::Delete(*filesystem_, base_dir_); @@ -568,7 +540,7 @@ libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() { return status; } - // TODO(b/216487496): Implement a more robust version of TC_ASSIGN_OR_RETURN + // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN // that can support error logging. 
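// One possible shape for the more robust macro these TODOs ask for
// (hypothetical sketch, not an existing Icing or libtextclassifier3 macro):
//
//   #define ICING_RETURN_IF_ERROR_LOGGED(expr)              \
//     do {                                                  \
//       ::libtextclassifier3::Status _status = (expr);      \
//       if (!_status.ok()) {                                \
//         ICING_LOG(ERROR) << _status.error_message();      \
//         return _status;                                   \
//       }                                                   \
//     } while (false)
//
// This would collapse the repeated Delete-then-log-then-return pattern in each
// Reset*() helper into a single statement.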
auto document_key_mapper_or = KeyMapper<DocumentId>::Create(*filesystem_, base_dir_, kUriMapperMaxSize); @@ -584,7 +556,7 @@ libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() { libtextclassifier3::Status DocumentStore::ResetDocumentIdMapper() { // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset(). document_id_mapper_.reset(); - // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. libtextclassifier3::Status status = FileBackedVector<int64_t>::Delete( *filesystem_, MakeDocumentIdMapperFilename(base_dir_)); @@ -593,7 +565,7 @@ libtextclassifier3::Status DocumentStore::ResetDocumentIdMapper() { << "Failed to delete old document_id mapper"; return status; } - // TODO(b/216487496): Implement a more robust version of TC_ASSIGN_OR_RETURN + // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN // that can support error logging. auto document_id_mapper_or = FileBackedVector<int64_t>::Create( *filesystem_, MakeDocumentIdMapperFilename(base_dir_), @@ -646,7 +618,7 @@ libtextclassifier3::Status DocumentStore::ResetFilterCache() { libtextclassifier3::Status DocumentStore::ResetNamespaceMapper() { // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset(). namespace_mapper_.reset(); - // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. libtextclassifier3::Status status = KeyMapper<NamespaceId>::Delete( *filesystem_, MakeNamespaceMapperFilename(base_dir_)); @@ -666,7 +638,7 @@ libtextclassifier3::Status DocumentStore::ResetNamespaceMapper() { libtextclassifier3::Status DocumentStore::ResetCorpusMapper() { // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset(). corpus_mapper_.reset(); - // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. libtextclassifier3::Status status = KeyMapper<CorpusId>::Delete( *filesystem_, MakeCorpusMapperFilename(base_dir_)); @@ -1777,63 +1749,5 @@ libtextclassifier3::Status DocumentStore::SetUsageScores( return usage_store_->SetUsageScores(document_id, usage_scores); } -libtextclassifier3::StatusOr< - google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>> -DocumentStore::CollectCorpusInfo() const { - google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo> - corpus_info; - libtextclassifier3::StatusOr<const SchemaProto*> schema_proto_or = - schema_store_->GetSchema(); - if (!schema_proto_or.ok()) { - return corpus_info; - } - // Maps from CorpusId to the corresponding protocol buffer in the result. 
- std::unordered_map<CorpusId, DocumentDebugInfoProto::CorpusInfo*> info_map; - std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace = - namespace_mapper_->GetValuesToKeys(); - const SchemaProto* schema_proto = schema_proto_or.ValueOrDie(); - for (DocumentId document_id = 0; document_id < filter_cache_->num_elements(); - ++document_id) { - if (!InternalDoesDocumentExist(document_id)) { - continue; - } - ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data, - filter_cache_->Get(document_id)); - ICING_ASSIGN_OR_RETURN(const DocumentAssociatedScoreData* score_data, - score_cache_->Get(document_id)); - const std::string& name_space = - namespace_id_to_namespace[filter_data->namespace_id()]; - const std::string& schema = - schema_proto->types()[filter_data->schema_type_id()].schema_type(); - auto iter = info_map.find(score_data->corpus_id()); - if (iter == info_map.end()) { - DocumentDebugInfoProto::CorpusInfo* entry = corpus_info.Add(); - entry->set_namespace_(name_space); - entry->set_schema(schema); - iter = info_map.insert({score_data->corpus_id(), entry}).first; - } - iter->second->set_total_documents(iter->second->total_documents() + 1); - iter->second->set_total_token(iter->second->total_token() + - score_data->length_in_tokens()); - } - return corpus_info; -} - -libtextclassifier3::StatusOr<DocumentDebugInfoProto> -DocumentStore::GetDebugInfo(int verbosity) const { - DocumentDebugInfoProto debug_info; - *debug_info.mutable_document_storage_info() = GetStorageInfo(); - ICING_ASSIGN_OR_RETURN(Crc32 crc, ComputeChecksum()); - debug_info.set_crc(crc.Get()); - if (verbosity > 0) { - ICING_ASSIGN_OR_RETURN(google::protobuf::RepeatedPtrField< - DocumentDebugInfoProto::CorpusInfo> - corpus_info, - CollectCorpusInfo()); - *debug_info.mutable_corpus_info() = std::move(corpus_info); - } - return debug_info; -} - } // namespace lib } // namespace icing diff --git a/icing/store/document-store.h b/icing/store/document-store.h index e6d2e5c..a60aab1 100644 --- a/icing/store/document-store.h +++ b/icing/store/document-store.h @@ -27,7 +27,6 @@ #include "icing/file/file-backed-vector.h" #include "icing/file/filesystem.h" #include "icing/file/portable-file-backed-proto-log.h" -#include "icing/proto/debug.pb.h" #include "icing/proto/document.pb.h" #include "icing/proto/document_wrapper.pb.h" #include "icing/proto/logging.pb.h" @@ -423,17 +422,6 @@ class DocumentStore { // INTERNAL_ERROR on compute error libtextclassifier3::StatusOr<Crc32> ComputeChecksum() const; - // Get debug information for the document store. - // verbosity <= 0, simplest debug information - // verbosity > 0, also return the total number of documents and tokens in each - // (namespace, schema type) pair. - // - // Returns: - // DocumentDebugInfoProto on success - // INTERNAL_ERROR on IO errors, crc compute error - libtextclassifier3::StatusOr<DocumentDebugInfoProto> GetDebugInfo( - int verbosity) const; - private: // Use DocumentStore::Create() to instantiate. DocumentStore(const Filesystem* filesystem, std::string_view base_dir, @@ -509,6 +497,28 @@ class DocumentStore { bool force_recovery_and_revalidate_documents, InitializeStatsProto* initialize_stats); + // Initializes a new DocumentStore and sets up any underlying files. 
+ // + // Returns: + // Data loss status on success, effectively always DataLoss::NONE + // INTERNAL on I/O error + libtextclassifier3::StatusOr<DataLoss> InitializeNewStore( + InitializeStatsProto* initialize_stats); + + // Initializes a DocumentStore over an existing directory of files. + // + // stats will be set if non-null + // + // Returns: + // Data loss status on success + // INTERNAL on I/O error + libtextclassifier3::StatusOr<DataLoss> InitializeExistingStore( + bool force_recovery_and_revalidate_documents, + InitializeStatsProto* initialize_stats); + + libtextclassifier3::StatusOr<DataLoss> MigrateFromV0ToV1( + InitializeStatsProto* initialize_stats); + + // Creates sub-components and verifies the integrity of each sub-component. // This assumes that the underlying files already exist, and will return // an error if it doesn't find what it's expecting. @@ -708,13 +718,6 @@ class DocumentStore { // the document_id_mapper somehow became larger than the filter cache. DocumentStorageInfoProto CalculateDocumentStatusCounts( DocumentStorageInfoProto storage_info) const; - - // Returns: - // - on success, a RepeatedPtrField for CorpusInfo collected. - // - OUT_OF_RANGE, this should never happen. - libtextclassifier3::StatusOr<google::protobuf::RepeatedPtrField< - DocumentDebugInfoProto::CorpusInfo>> - CollectCorpusInfo() const; }; } // namespace lib diff --git a/icing/store/document-store_benchmark.cc b/icing/store/document-store_benchmark.cc index fc3fd9d..ce608fc 100644 --- a/icing/store/document-store_benchmark.cc +++ b/icing/store/document-store_benchmark.cc @@ -32,7 +32,6 @@ #include "icing/document-builder.h" #include "icing/file/filesystem.h" #include "icing/proto/document.pb.h" -#include "icing/proto/persist.pb.h" #include "icing/proto/schema.pb.h" #include "icing/schema-builder.h" #include "icing/schema/schema-store.h" @@ -64,13 +63,13 @@ namespace lib { namespace { -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = - PropertyConfigProto::Cardinality::OPTIONAL; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = + PropertyConfigProto_Cardinality_Code_OPTIONAL; -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = - StringIndexingConfig::TokenizerType::PLAIN; +constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = + StringIndexingConfig_TokenizerType_Code_PLAIN; -constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; +constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; class DestructibleDirectory { public: @@ -256,74 +255,6 @@ void BM_Delete(benchmark::State& state) { } BENCHMARK(BM_Delete); -void BM_Create(benchmark::State& state) { - Filesystem filesystem; - Clock clock; - - std::string directory = GetTestTempDir() + "/icing"; - std::string document_store_dir = directory + "/store"; - - std::unique_ptr<SchemaStore> schema_store = - CreateSchemaStore(filesystem, directory, &clock); - - // Create an initial document store and put some data in.
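// One plausible way the InitializeNewStore()/InitializeExistingStore() split
// declared above fits together; this dispatch is a sketch, not the actual
// implementation, and the MakeDocumentLogFilename() helper is hypothetical:
libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize(
    bool force_recovery_and_revalidate_documents,
    InitializeStatsProto* initialize_stats) {
  const std::string document_log_file = MakeDocumentLogFilename(base_dir_);
  if (!filesystem_->FileExists(document_log_file.c_str())) {
    // Nothing on disk yet: set up fresh files; data loss is effectively NONE.
    return InitializeNewStore(initialize_stats);
  }
  // Reuse existing files, migrating and regenerating derived files as needed;
  // stats are only populated when this amounts to a real recovery.
  return InitializeExistingStore(force_recovery_and_revalidate_documents,
                                 initialize_stats);
}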
- { - DestructibleDirectory ddir(filesystem, directory); - - filesystem.CreateDirectoryRecursively(document_store_dir.data()); - ICING_ASSERT_OK_AND_ASSIGN( - DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem, document_store_dir, &clock, - schema_store.get())); - std::unique_ptr<DocumentStore> document_store = - std::move(create_result.document_store); - - DocumentProto document = CreateDocument("namespace", "uri"); - ICING_ASSERT_OK(document_store->Put(document)); - ICING_ASSERT_OK(document_store->PersistToDisk(PersistType::FULL)); - } - - // Recreating it with some content to checksum over. - DestructibleDirectory ddir(filesystem, directory); - - filesystem.CreateDirectoryRecursively(document_store_dir.data()); - - for (auto s : state) { - benchmark::DoNotOptimize(DocumentStore::Create( - &filesystem, document_store_dir, &clock, schema_store.get())); - } -} -BENCHMARK(BM_Create); - -void BM_ComputeChecksum(benchmark::State& state) { - Filesystem filesystem; - Clock clock; - - std::string directory = GetTestTempDir() + "/icing"; - DestructibleDirectory ddir(filesystem, directory); - - std::string document_store_dir = directory + "/store"; - std::unique_ptr<SchemaStore> schema_store = - CreateSchemaStore(filesystem, directory, &clock); - - filesystem.CreateDirectoryRecursively(document_store_dir.data()); - ICING_ASSERT_OK_AND_ASSIGN( - DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem, document_store_dir, &clock, - schema_store.get())); - std::unique_ptr<DocumentStore> document_store = - std::move(create_result.document_store); - - DocumentProto document = CreateDocument("namespace", "uri"); - ICING_ASSERT_OK(document_store->Put(document)); - ICING_ASSERT_OK(document_store->PersistToDisk(PersistType::LITE)); - - for (auto s : state) { - benchmark::DoNotOptimize(document_store->ComputeChecksum()); - } -} -BENCHMARK(BM_ComputeChecksum); - } // namespace } // namespace lib diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc index a30b4e4..3ed4c4e 100644 --- a/icing/store/document-store_test.cc +++ b/icing/store/document-store_test.cc @@ -29,6 +29,7 @@ #include "icing/file/filesystem.h" #include "icing/file/memory-mapped-file.h" #include "icing/file/mock-filesystem.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/equals-proto.h" #include "icing/portable/platform.h" #include "icing/proto/document.pb.h" @@ -44,7 +45,6 @@ #include "icing/store/namespace-id.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" -#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" #include "icing/tokenization/language-segmenter-factory.h" @@ -85,16 +85,16 @@ const NamespaceStorageInfoProto& GetNamespaceStorageInfo( return std::move(NamespaceStorageInfoProto()); } -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = - PropertyConfigProto::Cardinality::OPTIONAL; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = + PropertyConfigProto_Cardinality_Code_OPTIONAL; -constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = - StringIndexingConfig::TokenizerType::PLAIN; +constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = + StringIndexingConfig_TokenizerType_Code_PLAIN; -constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; +constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; -constexpr 
PropertyConfigProto::DataType::Code TYPE_INT = - PropertyConfigProto::DataType::INT64; +constexpr PropertyConfigProto_DataType_Code TYPE_INT = + PropertyConfigProto_DataType_Code_INT64; UsageReport CreateUsageReport(std::string name_space, std::string uri, int64 timestamp_ms, @@ -3170,6 +3170,15 @@ TEST_F(DocumentStoreTest, DetectCompleteDataLoss) { ASSERT_THAT(create_result.data_loss, Eq(DataLoss::COMPLETE)); } +// TODO(b/185845269) Re-enable this test by copying over a full valid set of +// document store files. Right now this test only includes the score_cache and +// the document store header. +// +// This causes a problem now because this cl changes behavior to not consider an +// InitializeExistingDerivedFiles failure to be a recovery if there is nothing +// to recover because the document store is empty. +#define DISABLE_BACKWARDS_COMPAT_TEST +#ifndef DISABLE_BACKWARDS_COMPAT_TEST TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) { // The directory testdata/score_cache_without_length_in_tokens/document_store // contains only the scoring_cache and the document_store_header (holding the @@ -3185,26 +3194,29 @@ TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) { // Get src files std::string document_store_without_length_in_tokens; - if (IsAndroidArm() || IsIosPlatform()) { + if (IsAndroidPlatform() || IsIosPlatform()) { document_store_without_length_in_tokens = GetTestFilePath( "icing/testdata/score_cache_without_length_in_tokens/" "document_store_android_ios_compatible"); - } else if (IsAndroidX86()) { - document_store_without_length_in_tokens = GetTestFilePath( - "icing/testdata/score_cache_without_length_in_tokens/" - "document_store_android_x86"); } else { document_store_without_length_in_tokens = GetTestFilePath( "icing/testdata/score_cache_without_length_in_tokens/" "document_store"); } + std::vector<std::string> document_store_files; Filesystem filesystem; - ICING_LOG(INFO) << "Copying files " - << document_store_without_length_in_tokens; - ASSERT_THAT( - filesystem.CopyDirectory(document_store_without_length_in_tokens.c_str(), - document_store_dir_.c_str(), /*recursive=*/true), - true); + filesystem.ListDirectory(document_store_without_length_in_tokens.c_str(), - &document_store_files); + + ICING_LOG(INFO) << "Copying files " << document_store_without_length_in_tokens + << ' ' << document_store_files.size(); + for (size_t i = 0; i != document_store_files.size(); i++) { + std::string src = absl_ports::StrCat( + document_store_without_length_in_tokens, "/", document_store_files[i]); + std::string dst = + absl_ports::StrCat(document_store_dir_, "/", document_store_files[i]); + ASSERT_THAT(filesystem_.CopyFile(src.c_str(), dst.c_str()), true); + } InitializeStatsProto initialize_stats; ICING_ASSERT_OK_AND_ASSIGN( @@ -3215,11 +3227,12 @@ TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) { &initialize_stats)); std::unique_ptr<DocumentStore> doc_store = std::move(create_result.document_store); - // The document log is using the legacy v0 format so that a migration is - // needed, which will also trigger regeneration. - EXPECT_EQ(initialize_stats.document_store_recovery_cause(), - InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT); + // The store_cache triggers regeneration because its element size is + // inconsistent: expected 20 (current new size), actual 12 (as per the v0 + // score_cache).
+ EXPECT_TRUE(initialize_stats.has_document_store_recovery_cause()); } +#endif // DISABLE_BACKWARDS_COMPAT_TEST TEST_F(DocumentStoreTest, DocumentStoreStorageInfo) { ICING_ASSERT_OK_AND_ASSIGN( @@ -3409,22 +3422,18 @@ TEST_F(DocumentStoreTest, InitializeForceRecoveryUpdatesTypeIds) { { // Create the document store the second time and force recovery - InitializeStatsProto initialize_stats; ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, - schema_store.get(), - /*force_recovery_and_revalidate_documents=*/true, - &initialize_stats)); + DocumentStore::Create( + &filesystem_, document_store_dir_, &fake_clock_, schema_store.get(), + /*force_recovery_and_revalidate_documents=*/true)); std::unique_ptr<DocumentStore> doc_store = std::move(create_result.document_store); // Ensure that the type id of the email document has been correctly updated. ICING_ASSERT_OK_AND_ASSIGN(DocumentFilterData filter_data, doc_store->GetDocumentFilterData(docid)); - EXPECT_THAT(filter_data.schema_type_id(), Eq(1)); - EXPECT_THAT(initialize_stats.document_store_recovery_cause(), - Eq(InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC)); + ASSERT_THAT(filter_data.schema_type_id(), Eq(1)); } } @@ -3547,6 +3556,7 @@ TEST_F(DocumentStoreTest, InitializeForceRecoveryDeletesInvalidDocument) { SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); ASSERT_THAT(schema_store->SetSchema(schema), IsOk()); + DocumentId docid = kInvalidDocumentId; DocumentProto docWithBody = DocumentBuilder() .SetKey("icing", "email/1") @@ -3579,12 +3589,8 @@ TEST_F(DocumentStoreTest, InitializeForceRecoveryDeletesInvalidDocument) { std::unique_ptr<DocumentStore> doc_store = std::move(create_result.document_store); - DocumentId docid = kInvalidDocumentId; ICING_ASSERT_OK_AND_ASSIGN(docid, doc_store->Put(docWithBody)); - ASSERT_NE(docid, kInvalidDocumentId); - docid = kInvalidDocumentId; ICING_ASSERT_OK_AND_ASSIGN(docid, doc_store->Put(docWithoutBody)); - ASSERT_NE(docid, kInvalidDocumentId); ASSERT_THAT(doc_store->Get(docWithBody.namespace_(), docWithBody.uri()), IsOkAndHolds(EqualsProto(docWithBody))); @@ -3652,6 +3658,7 @@ TEST_F(DocumentStoreTest, InitializeDontForceRecoveryKeepsInvalidDocument) { SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); ASSERT_THAT(schema_store->SetSchema(schema), IsOk()); + DocumentId docid = kInvalidDocumentId; DocumentProto docWithBody = DocumentBuilder() .SetKey("icing", "email/1") @@ -3684,12 +3691,8 @@ TEST_F(DocumentStoreTest, InitializeDontForceRecoveryKeepsInvalidDocument) { std::unique_ptr<DocumentStore> doc_store = std::move(create_result.document_store); - DocumentId docid = kInvalidDocumentId; ICING_ASSERT_OK_AND_ASSIGN(docid, doc_store->Put(docWithBody)); - ASSERT_NE(docid, kInvalidDocumentId); - docid = kInvalidDocumentId; ICING_ASSERT_OK_AND_ASSIGN(docid, doc_store->Put(docWithoutBody)); - ASSERT_NE(docid, kInvalidDocumentId); ASSERT_THAT(doc_store->Get(docWithBody.namespace_(), docWithBody.uri()), IsOkAndHolds(EqualsProto(docWithBody))); @@ -3832,8 +3835,7 @@ TEST_F(DocumentStoreTest, MigrateToPortableFileBackedProtoLog) { // Check that we didn't lose anything. A migration also doesn't technically // count as a recovery. 
EXPECT_THAT(create_result.data_loss, Eq(DataLoss::NONE)); - EXPECT_EQ(initialize_stats.document_store_recovery_cause(), - InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT); + EXPECT_FALSE(initialize_stats.has_document_store_recovery_cause()); // Document 1 and 3 were put normally, and document 2 was deleted in our // testdata files. @@ -3856,164 +3858,6 @@ TEST_F(DocumentStoreTest, MigrateToPortableFileBackedProtoLog) { } #endif // DISABLE_BACKWARDS_COMPAT_TEST -TEST_F(DocumentStoreTest, GetDebugInfo) { - SchemaProto schema = - SchemaBuilder() - .AddType(SchemaTypeConfigBuilder() - .SetType("email") - .AddProperty( - PropertyConfigBuilder() - .SetName("subject") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL)) - .AddProperty( - PropertyConfigBuilder() - .SetName("body") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) - .AddType(SchemaTypeConfigBuilder().SetType("person").AddProperty( - PropertyConfigBuilder() - .SetName("name") - .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_OPTIONAL))) - .Build(); - std::string schema_store_dir = schema_store_dir_ + "_custom"; - filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str()); - filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str()); - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_)); - - ICING_ASSERT_OK(schema_store->SetSchema(schema)); - - ICING_ASSERT_OK_AND_ASSIGN( - DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, - schema_store.get())); - std::unique_ptr<DocumentStore> document_store = - std::move(create_result.document_store); - - DocumentProto document1 = DocumentBuilder() - .SetKey("namespace1", "email/1") - .SetSchema("email") - .AddStringProperty("subject", "aa bb cc") - .AddStringProperty("body", "dd ee") - .SetCreationTimestampMs(1) - .Build(); - ICING_ASSERT_OK(document_store->Put(document1, 5)); - - DocumentProto document2 = DocumentBuilder() - .SetKey("namespace2", "email/2") - .SetSchema("email") - .AddStringProperty("subject", "aa bb") - .AddStringProperty("body", "cc") - .SetCreationTimestampMs(1) - .Build(); - ICING_ASSERT_OK(document_store->Put(document2, 3)); - - DocumentProto document3 = DocumentBuilder() - .SetKey("namespace2", "email/3") - .SetSchema("email") - .AddStringProperty("subject", "aa") - .AddStringProperty("body", "") - .SetCreationTimestampMs(1) - .Build(); - ICING_ASSERT_OK(document_store->Put(document3, 1)); - - DocumentProto document4 = DocumentBuilder() - .SetKey("namespace1", "person/1") - .SetSchema("person") - .AddStringProperty("name", "test test") - .SetCreationTimestampMs(1) - .Build(); - ICING_ASSERT_OK(document_store->Put(document4, 2)); - - ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out1, - document_store->GetDebugInfo(/*verbosity=*/1)); - EXPECT_THAT(out1.crc(), Gt(0)); - EXPECT_THAT(out1.document_storage_info().num_alive_documents(), Eq(4)); - EXPECT_THAT(out1.document_storage_info().num_deleted_documents(), Eq(0)); - EXPECT_THAT(out1.document_storage_info().num_expired_documents(), Eq(0)); - - DocumentDebugInfoProto::CorpusInfo info1, info2, info3; - info1.set_namespace_("namespace1"); - info1.set_schema("email"); - info1.set_total_documents(1); // document1 - info1.set_total_token(5); - - info2.set_namespace_("namespace2"); - info2.set_schema("email"); - info2.set_total_documents(2); // 
document2 and document3 - info2.set_total_token(4); // 3 + 1 - - info3.set_namespace_("namespace1"); - info3.set_schema("person"); - info3.set_total_documents(1); // document4 - info3.set_total_token(2); - - EXPECT_THAT(out1.corpus_info(), - UnorderedElementsAre(EqualsProto(info1), EqualsProto(info2), - EqualsProto(info3))); - - // Delete document3. - ICING_ASSERT_OK(document_store->Delete("namespace2", "email/3")); - ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out2, - document_store->GetDebugInfo(/*verbosity=*/1)); - EXPECT_THAT(out2.crc(), Gt(0)); - EXPECT_THAT(out2.crc(), Not(Eq(out1.crc()))); - EXPECT_THAT(out2.document_storage_info().num_alive_documents(), Eq(3)); - EXPECT_THAT(out2.document_storage_info().num_deleted_documents(), Eq(1)); - EXPECT_THAT(out2.document_storage_info().num_expired_documents(), Eq(0)); - info2.set_total_documents(1); // document2 - info2.set_total_token(3); - EXPECT_THAT(out2.corpus_info(), - UnorderedElementsAre(EqualsProto(info1), EqualsProto(info2), - EqualsProto(info3))); - - ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out3, - document_store->GetDebugInfo(/*verbosity=*/0)); - EXPECT_THAT(out3.corpus_info(), IsEmpty()); -} - -TEST_F(DocumentStoreTest, GetDebugInfoWithoutSchema) { - std::string schema_store_dir = schema_store_dir_ + "_custom"; - filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str()); - filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str()); - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_)); - - ICING_ASSERT_OK_AND_ASSIGN( - DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, - schema_store.get())); - std::unique_ptr<DocumentStore> document_store = - std::move(create_result.document_store); - ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out, - document_store->GetDebugInfo(/*verbosity=*/1)); - EXPECT_THAT(out.crc(), Gt(0)); - EXPECT_THAT(out.document_storage_info().num_alive_documents(), Eq(0)); - EXPECT_THAT(out.document_storage_info().num_deleted_documents(), Eq(0)); - EXPECT_THAT(out.document_storage_info().num_expired_documents(), Eq(0)); - EXPECT_THAT(out.corpus_info(), IsEmpty()); -} - -TEST_F(DocumentStoreTest, GetDebugInfoForEmptyDocumentStore) { - ICING_ASSERT_OK_AND_ASSIGN( - DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, - schema_store_.get())); - std::unique_ptr<DocumentStore> document_store = - std::move(create_result.document_store); - ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out, - document_store->GetDebugInfo(/*verbosity=*/1)); - EXPECT_THAT(out.crc(), Gt(0)); - EXPECT_THAT(out.document_storage_info().num_alive_documents(), Eq(0)); - EXPECT_THAT(out.document_storage_info().num_deleted_documents(), Eq(0)); - EXPECT_THAT(out.document_storage_info().num_expired_documents(), Eq(0)); - EXPECT_THAT(out.corpus_info(), IsEmpty()); -} - } // namespace } // namespace lib diff --git a/icing/store/namespace-checker-impl.h b/icing/store/namespace-checker-impl.h deleted file mode 100644 index bcd0643..0000000 --- a/icing/store/namespace-checker-impl.h +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (C) 2021 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef ICING_STORE_NAMESPACE_CHECKER_IMPL_H_ -#define ICING_STORE_NAMESPACE_CHECKER_IMPL_H_ - -#include "icing/store/document-id.h" -#include "icing/store/document-store.h" -#include "icing/store/namespace-checker.h" -#include "icing/store/namespace-id.h" - -namespace icing { -namespace lib { - -class NamespaceCheckerImpl : public NamespaceChecker { - public: - explicit NamespaceCheckerImpl( - const DocumentStore* document_store, - std::unordered_set<NamespaceId> target_namespace_ids) - : document_store_(*document_store), - target_namespace_ids_(std::move(target_namespace_ids)) {} - - bool BelongsToTargetNamespaces(DocumentId document_id) const override { - if (target_namespace_ids_.empty()) { - return true; - } - auto document_filter_data_or_ = - document_store_.GetDocumentFilterData(document_id); - return document_filter_data_or_.ok() && - target_namespace_ids_.count( - document_filter_data_or_.ValueOrDie().namespace_id()) > 0; - } - const DocumentStore& document_store_; - std::unordered_set<NamespaceId> target_namespace_ids_; -}; - -} // namespace lib -} // namespace icing - -#endif // ICING_STORE_NAMESPACE_CHECKER_IMPL_H_
\ No newline at end of file diff --git a/icing/store/namespace-checker.h b/icing/store/namespace-checker.h deleted file mode 100644 index 8812ab1..0000000 --- a/icing/store/namespace-checker.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (C) 2021 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef ICING_STORE_NAMESPACE_CHECKER_H_ -#define ICING_STORE_NAMESPACE_CHECKER_H_ - -#include "icing/store/document-id.h" - -namespace icing { -namespace lib { - -class NamespaceChecker { - public: - virtual ~NamespaceChecker() = default; - - // Check whether the given document id belongs to the target namespaces. - // Returns: - // On success, - // - true: the given document id belongs to the target namespaces - // - false: the given document id doesn't belong to the target namespaces - // OUT_OF_RANGE if document_id is negative or exceeds previously seen - // DocumentIds - // NOT_FOUND if the document or the filter data is not found - // INTERNAL_ERROR on all other errors - virtual bool BelongsToTargetNamespaces(DocumentId document_id) const = 0; -}; - -} // namespace lib -} // namespace icing - -#endif // ICING_STORE_NAMESPACE_CHECKER_H_ diff --git a/icing/testing/always-true-namespace-checker-impl.h b/icing/testing/always-true-namespace-checker-impl.h deleted file mode 100644 index f7744b6..0000000 --- a/icing/testing/always-true-namespace-checker-impl.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (C) 2021 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef ICING_TESTING_ALWAYS_TRUE_NAMESPACE_CHECKER_IMPL_H_ -#define ICING_TESTING_ALWAYS_TRUE_NAMESPACE_CHECKER_IMPL_H_ - -#include "icing/store/document-id.h" -#include "icing/store/namespace-checker.h" - -namespace icing { -namespace lib { - -class AlwaysTrueNamespaceCheckerImpl : public NamespaceChecker { - public: - bool BelongsToTargetNamespaces(DocumentId document_id) const override { - return true; - } -}; - -} // namespace lib -} // namespace icing - -#endif // ICING_TESTING_ALWAYS_TRUE_NAMESPACE_CHECKER_IMPL_H_
\ No newline at end of file diff --git a/icing/testing/common-matchers.h b/icing/testing/common-matchers.h index f83fe0a..8d8bdf2 100644 --- a/icing/testing/common-matchers.h +++ b/icing/testing/common-matchers.h @@ -121,6 +121,7 @@ MATCHER_P(EqualsSetSchemaResult, expected, "") { const SchemaStore::SetSchemaResult& actual = arg; if (actual.success == expected.success && + actual.index_incompatible == expected.index_incompatible && actual.old_schema_type_ids_changed == expected.old_schema_type_ids_changed && actual.schema_types_deleted_by_name == @@ -130,12 +131,7 @@ MATCHER_P(EqualsSetSchemaResult, expected, "") { actual.schema_types_incompatible_by_name == expected.schema_types_incompatible_by_name && actual.schema_types_incompatible_by_id == - expected.schema_types_incompatible_by_id && - actual.schema_types_new_by_name == expected.schema_types_new_by_name && - actual.schema_types_changed_fully_compatible_by_name == - expected.schema_types_changed_fully_compatible_by_name && - actual.schema_types_index_incompatible_by_name == - expected.schema_types_index_incompatible_by_name) { + expected.schema_types_incompatible_by_id) { return true; } @@ -195,82 +191,37 @@ MATCHER_P(EqualsSetSchemaResult, expected, "") { absl_ports::NumberFormatter()), "]"); - // Format schema_types_new_by_name - std::string actual_schema_types_new_by_name = absl_ports::StrCat( - "[", absl_ports::StrJoin(actual.schema_types_new_by_name, ","), "]"); - - std::string expected_schema_types_new_by_name = absl_ports::StrCat( - "[", absl_ports::StrJoin(expected.schema_types_new_by_name, ","), "]"); - - // Format schema_types_changed_fully_compatible_by_name - std::string actual_schema_types_changed_fully_compatible_by_name = - absl_ports::StrCat( - "[", - absl_ports::StrJoin( - actual.schema_types_changed_fully_compatible_by_name, ","), - "]"); - - std::string expected_schema_types_changed_fully_compatible_by_name = - absl_ports::StrCat( - "[", - absl_ports::StrJoin( - expected.schema_types_changed_fully_compatible_by_name, ","), - "]"); - - // Format schema_types_deleted_by_id - std::string actual_schema_types_index_incompatible_by_name = - absl_ports::StrCat( - "[", - absl_ports::StrJoin(actual.schema_types_index_incompatible_by_name, - ","), - "]"); - - std::string expected_schema_types_index_incompatible_by_name = - absl_ports::StrCat( - "[", - absl_ports::StrJoin(expected.schema_types_index_incompatible_by_name, - ","), - "]"); - *result_listener << IcingStringUtil::StringPrintf( "\nExpected {\n" "\tsuccess=%d,\n" + "\tindex_incompatible=%d,\n" "\told_schema_type_ids_changed=%s,\n" "\tschema_types_deleted_by_name=%s,\n" "\tschema_types_deleted_by_id=%s,\n" "\tschema_types_incompatible_by_name=%s,\n" "\tschema_types_incompatible_by_id=%s\n" - "\tschema_types_new_by_name=%s,\n" - "\tschema_types_index_incompatible_by_name=%s,\n" - "\tschema_types_changed_fully_compatible_by_name=%s\n" "}\n" "Actual {\n" "\tsuccess=%d,\n" + "\tindex_incompatible=%d,\n" "\told_schema_type_ids_changed=%s,\n" "\tschema_types_deleted_by_name=%s,\n" "\tschema_types_deleted_by_id=%s,\n" "\tschema_types_incompatible_by_name=%s,\n" "\tschema_types_incompatible_by_id=%s\n" - "\tschema_types_new_by_name=%s,\n" - "\tschema_types_index_incompatible_by_name=%s,\n" - "\tschema_types_changed_fully_compatible_by_name=%s\n" "}\n", - expected.success, expected_old_schema_type_ids_changed.c_str(), + expected.success, expected.index_incompatible, + expected_old_schema_type_ids_changed.c_str(), expected_schema_types_deleted_by_name.c_str(), 
expected_schema_types_deleted_by_id.c_str(), expected_schema_types_incompatible_by_name.c_str(), - expected_schema_types_incompatible_by_id.c_str(), - expected_schema_types_new_by_name.c_str(), - expected_schema_types_changed_fully_compatible_by_name.c_str(), - expected_schema_types_index_incompatible_by_name.c_str(), actual.success, - actual_old_schema_type_ids_changed.c_str(), + expected_schema_types_incompatible_by_id.c_str(), actual.success, + actual.index_incompatible, actual_old_schema_type_ids_changed.c_str(), actual_schema_types_deleted_by_name.c_str(), actual_schema_types_deleted_by_id.c_str(), actual_schema_types_incompatible_by_name.c_str(), - actual_schema_types_incompatible_by_id.c_str(), - actual_schema_types_new_by_name.c_str(), - actual_schema_types_changed_fully_compatible_by_name.c_str(), - actual_schema_types_index_incompatible_by_name.c_str()); + actual_schema_types_incompatible_by_id.c_str()); + return false; } diff --git a/icing/testing/random-string.cc b/icing/testing/random-string.cc deleted file mode 100644 index 27f83bc..0000000 --- a/icing/testing/random-string.cc +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (C) 2019 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "icing/testing/random-string.h" - -namespace icing { -namespace lib { - -std::vector<std::string> GenerateUniqueTerms(int num_terms) { - char before_a = 'a' - 1; - std::string term(1, before_a); - std::vector<std::string> terms; - int current_char = 0; - for (int permutation = 0; permutation < num_terms; ++permutation) { - if (term[current_char] != 'z') { - ++term[current_char]; - } else { - if (current_char < term.length() - 1) { - // The string currently looks something like this "zzzaa" - // 1. Find the first char after this one that isn't - current_char = term.find_first_not_of('z', current_char); - if (current_char != std::string::npos) { - // 2. Increment that character - ++term[current_char]; - - // 3. Set every character prior to current_char to 'a' - term.replace(0, current_char, current_char, 'a'); - } else { - // Every character in this string is a 'z'. We need to grow. - term = std::string(term.length() + 1, 'a'); - } - } else { - term = std::string(term.length() + 1, 'a'); - } - current_char = 0; - } - terms.push_back(term); - } - return terms; -} - -} // namespace lib -} // namespace icing diff --git a/icing/testing/random-string.h b/icing/testing/random-string.h index fd8d87b..ac36924 100644 --- a/icing/testing/random-string.h +++ b/icing/testing/random-string.h @@ -15,7 +15,6 @@ #ifndef ICING_TESTING_RANDOM_STRING_H_ #define ICING_TESTING_RANDOM_STRING_H_ -#include <algorithm> #include <random> #include <string> @@ -37,10 +36,6 @@ std::string RandomString(const std::string_view alphabet, size_t len, return result; } -// Returns a vector containing num_terms unique terms. Terms are created in -// non-random order starting with "a" to "z" to "aa" to "zz", etc. 
-std::vector<std::string> GenerateUniqueTerms(int num_terms); - } // namespace lib } // namespace icing diff --git a/icing/testing/random-string_test.cc b/icing/testing/random-string_test.cc deleted file mode 100644 index 759fec0..0000000 --- a/icing/testing/random-string_test.cc +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (C) 2019 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "icing/testing/random-string.h" - -#include "gmock/gmock.h" -#include "gtest/gtest.h" - -using ::testing::ElementsAre; -using ::testing::Eq; -using ::testing::IsEmpty; - -namespace icing { -namespace lib { - -namespace { - -TEST(RandomStringTest, GenerateUniqueTerms) { - EXPECT_THAT(GenerateUniqueTerms(0), IsEmpty()); - EXPECT_THAT(GenerateUniqueTerms(1), ElementsAre("a")); - EXPECT_THAT(GenerateUniqueTerms(4), ElementsAre("a", "b", "c", "d")); - EXPECT_THAT(GenerateUniqueTerms(29), - ElementsAre("a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", - "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", - "w", "x", "y", "z", "aa", "ba", "ca")); - EXPECT_THAT(GenerateUniqueTerms(56), - ElementsAre("a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", - "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", - "w", "x", "y", "z", "aa", "ba", "ca", "da", "ea", - "fa", "ga", "ha", "ia", "ja", "ka", "la", "ma", "na", - "oa", "pa", "qa", "ra", "sa", "ta", "ua", "va", "wa", - "xa", "ya", "za", "ab", "bb", "cb", "db")); - EXPECT_THAT(GenerateUniqueTerms(56).at(54), Eq("cb")); - EXPECT_THAT(GenerateUniqueTerms(26 * 26 * 26).at(26), Eq("aa")); - EXPECT_THAT(GenerateUniqueTerms(26 * 26 * 26).at(26 * 27), Eq("aaa")); - EXPECT_THAT(GenerateUniqueTerms(26 * 26 * 26).at(26 * 27 - 6), Eq("uz")); - EXPECT_THAT(GenerateUniqueTerms(26 * 26 * 26).at(26 * 27 + 5), Eq("faa")); -} - -} // namespace - -} // namespace lib -} // namespace icing diff --git a/icing/testing/snippet-helpers.cc b/icing/testing/snippet-helpers.cc index 7a71987..cfd20c2 100644 --- a/icing/testing/snippet-helpers.cc +++ b/icing/testing/snippet-helpers.cc @@ -77,16 +77,6 @@ std::vector<std::string_view> GetMatches( return matches; } -std::vector<std::string_view> GetSubMatches( - std::string_view content, const SnippetProto::EntryProto& snippet_proto) { - std::vector<std::string_view> matches; - for (const SnippetMatchProto& match : snippet_proto.snippet_matches()) { - matches.push_back(content.substr(match.exact_match_byte_position(), - match.submatch_byte_length())); - } - return matches; -} - std::string_view GetString(const DocumentProto* document, std::string_view property_path) { std::vector<std::string_view> properties = diff --git a/icing/testing/snippet-helpers.h b/icing/testing/snippet-helpers.h index 73b2ce2..defadeb 100644 --- a/icing/testing/snippet-helpers.h +++ b/icing/testing/snippet-helpers.h @@ -40,10 +40,6 @@ std::vector<std::string_view> GetWindows( std::vector<std::string_view> GetMatches( std::string_view content, const SnippetProto::EntryProto& snippet_proto); -// Retrieves all submatches defined by the 
snippet_proto for the content. -std::vector<std::string_view> GetSubMatches( - std::string_view content, const SnippetProto::EntryProto& snippet_proto); - // Retrieves the string value held in the document corresponding to the // property_path. // Example: diff --git a/icing/tokenization/combined-tokenizer_test.cc b/icing/tokenization/combined-tokenizer_test.cc deleted file mode 100644 index 0212e4f..0000000 --- a/icing/tokenization/combined-tokenizer_test.cc +++ /dev/null @@ -1,232 +0,0 @@ -// Copyright (C) 2022 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include <string_view> -#include <vector> - -#include "testing/base/public/gmock.h" -#include "testing/base/public/gunit.h" -#include "third_party/icing/portable/platform.h" -#include "third_party/icing/proto/schema_proto_portable.pb.h" -#include "third_party/icing/testing/common-matchers.h" -#include "third_party/icing/testing/icu-data-file-helper.h" -#include "third_party/icing/testing/jni-test-helpers.h" -#include "third_party/icing/testing/test-data.h" -#include "third_party/icing/tokenization/language-segmenter-factory.h" -#include "third_party/icing/tokenization/language-segmenter.h" -#include "third_party/icing/tokenization/tokenizer-factory.h" -#include "third_party/icing/tokenization/tokenizer.h" -#include "third_party/icu/include/unicode/uloc.h" - -namespace icing { -namespace lib { - -namespace { - -using ::testing::ElementsAre; - -// This test exists to ensure that the different tokenizers treat different -// segments of text in the same manner. -class CombinedTokenizerTest : public ::testing::Test { - protected: - void SetUp() override { - if (!IsCfStringTokenization() && !IsReverseJniTokenization()) { - ICING_ASSERT_OK( - // File generated via icu_data_file rule in //third_party/icing/BUILD. - icu_data_file_helper::SetUpICUDataFile( - GetTestFilePath("third_party/icing/icu.dat"))); - } - jni_cache_ = GetTestJniCache(); - - language_segmenter_factory::SegmenterOptions options(ULOC_US, - jni_cache_.get()); - ICING_ASSERT_OK_AND_ASSIGN( - lang_segmenter_, - language_segmenter_factory::Create(std::move(options))); - } - - std::unique_ptr<const JniCache> jni_cache_; - std::unique_ptr<LanguageSegmenter> lang_segmenter_; -}; - -std::vector<std::string> GetTokenTerms(const std::vector<Token>& tokens) { - std::vector<std::string> terms; - terms.reserve(tokens.size()); - for (const Token& token : tokens) { - if (token.type == Token::Type::REGULAR) { - terms.push_back(std::string(token.text)); - } - } - return terms; -} - -} // namespace - -TEST_F(CombinedTokenizerTest, SpecialCharacters) { - const std::string_view kText = "😊 Hello! 
Goodbye?"; - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<Tokenizer> indexing_tokenizer, - tokenizer_factory::CreateIndexingTokenizer( - StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get())); - - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<Tokenizer> query_tokenizer, - CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY, - lang_segmenter_.get())); - - ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens, - indexing_tokenizer->TokenizeAll(kText)); - std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens); - EXPECT_THAT(indexing_terms, ElementsAre("😊", "Hello", "Goodbye")); - - ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens, - query_tokenizer->TokenizeAll(kText)); - std::vector<std::string> query_terms = GetTokenTerms(query_tokens); - EXPECT_THAT(query_terms, ElementsAre("😊", "Hello", "Goodbye")); -} - -TEST_F(CombinedTokenizerTest, Parentheses) { - const std::string_view kText = "((paren1)(paren2) (last paren))"; - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<Tokenizer> indexing_tokenizer, - tokenizer_factory::CreateIndexingTokenizer( - StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get())); - - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<Tokenizer> query_tokenizer, - CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY, - lang_segmenter_.get())); - - ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens, - indexing_tokenizer->TokenizeAll(kText)); - std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens); - EXPECT_THAT(indexing_terms, ElementsAre("paren1", "paren2", "last", "paren")); - - ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens, - query_tokenizer->TokenizeAll(kText)); - std::vector<std::string> query_terms = GetTokenTerms(query_tokens); - EXPECT_THAT(query_terms, ElementsAre("paren1", "paren2", "last", "paren")); -} - -TEST_F(CombinedTokenizerTest, Negation) { - const std::string_view kText = "-foo -bar -baz"; - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<Tokenizer> indexing_tokenizer, - tokenizer_factory::CreateIndexingTokenizer( - StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get())); - - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<Tokenizer> query_tokenizer, - CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY, - lang_segmenter_.get())); - - ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens, - indexing_tokenizer->TokenizeAll(kText)); - std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens); - EXPECT_THAT(indexing_terms, ElementsAre("foo", "bar", "baz")); - - ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens, - query_tokenizer->TokenizeAll(kText)); - std::vector<std::string> query_terms = GetTokenTerms(query_tokens); - EXPECT_THAT(query_terms, ElementsAre("foo", "bar", "baz")); -} - -TEST_F(CombinedTokenizerTest, Colons) { - const std::string_view kText = ":foo: :bar baz:"; - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<Tokenizer> indexing_tokenizer, - tokenizer_factory::CreateIndexingTokenizer( - StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get())); - - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<Tokenizer> query_tokenizer, - CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY, - lang_segmenter_.get())); - - ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens, - indexing_tokenizer->TokenizeAll(kText)); - std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens); - EXPECT_THAT(indexing_terms, 
ElementsAre("foo", "bar", "baz")); - - ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens, - query_tokenizer->TokenizeAll(kText)); - std::vector<std::string> query_terms = GetTokenTerms(query_tokens); - EXPECT_THAT(query_terms, ElementsAre("foo", "bar", "baz")); -} - -TEST_F(CombinedTokenizerTest, ColonsPropertyRestricts) { - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<Tokenizer> indexing_tokenizer, - tokenizer_factory::CreateIndexingTokenizer( - StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get())); - - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<Tokenizer> query_tokenizer, - CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY, - lang_segmenter_.get())); - - // This is a difference between the two tokenizers. "foo:bar" is a single - // token to the plain tokenizer because ':' is a word connector. But "foo:bar" - // is a property restrict to the query tokenizer - so "foo" is the property - // and "bar" is the only text term. - constexpr std::string_view kText = "foo:bar"; - ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens, - indexing_tokenizer->TokenizeAll(kText)); - std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens); - EXPECT_THAT(indexing_terms, ElementsAre("foo:bar")); - - ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens, - query_tokenizer->TokenizeAll(kText)); - std::vector<std::string> query_terms = GetTokenTerms(query_tokens); - EXPECT_THAT(query_terms, ElementsAre("bar")); - - // This difference, however, should only apply to the first ':'. A - // second ':' should be treated by both tokenizers as a word connector. - constexpr std::string_view kText2 = "foo:bar:baz"; - ICING_ASSERT_OK_AND_ASSIGN(indexing_tokens, - indexing_tokenizer->TokenizeAll(kText2)); - indexing_terms = GetTokenTerms(indexing_tokens); - EXPECT_THAT(indexing_terms, ElementsAre("foo:bar:baz")); - - ICING_ASSERT_OK_AND_ASSIGN(query_tokens, - query_tokenizer->TokenizeAll(kText2)); - query_terms = GetTokenTerms(query_tokens); - EXPECT_THAT(query_terms, ElementsAre("bar:baz")); -} - -TEST_F(CombinedTokenizerTest, Punctuation) { - const std::string_view kText = "Who? What!? 
Why & How."; - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<Tokenizer> indexing_tokenizer, - tokenizer_factory::CreateIndexingTokenizer( - StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get())); - - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<Tokenizer> query_tokenizer, - CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY, - lang_segmenter_.get())); - - ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens, - indexing_tokenizer->TokenizeAll(kText)); - std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens); - EXPECT_THAT(indexing_terms, ElementsAre("Who", "What", "Why", "How")); - - ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens, - query_tokenizer->TokenizeAll(kText)); - std::vector<std::string> query_terms = GetTokenTerms(query_tokens); - EXPECT_THAT(query_terms, ElementsAre("Who", "What", "Why", "How")); -} - -} // namespace lib -} // namespace icing diff --git a/icing/tokenization/icu/icu-language-segmenter.cc b/icing/tokenization/icu/icu-language-segmenter.cc index dc7b0a4..cb31441 100644 --- a/icing/tokenization/icu/icu-language-segmenter.cc +++ b/icing/tokenization/icu/icu-language-segmenter.cc @@ -59,7 +59,7 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { ~IcuLanguageSegmenterIterator() { ubrk_close(break_iterator_); - utext_close(u_text_); + utext_close(&u_text_); } // Advances to the next term. Returns false if it has reached the end. @@ -83,6 +83,9 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { return false; } + if (!IsValidSegment()) { + return Advance(); + } return true; } @@ -223,7 +226,8 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { return absl_ports::AbortedError( "Could not retrieve valid utf8 character!"); } - if (term_end_index_exclusive_ > offset_iterator_.utf8_index()) { + if (term_end_index_exclusive_ > offset_iterator_.utf8_index() || + !IsValidSegment()) { return ResetToTermEndingBeforeUtf32(term_start_iterator.utf32_index()); } return term_start_iterator.utf32_index(); @@ -249,7 +253,7 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { : break_iterator_(nullptr), text_(text), locale_(locale), - u_text_(nullptr), + u_text_(UTEXT_INITIALIZER), offset_iterator_(text), term_start_index_(0), term_end_index_exclusive_(0) {} @@ -257,13 +261,10 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { // Returns true on success bool Initialize() { UErrorCode status = U_ZERO_ERROR; - u_text_ = utext_openUTF8(nullptr, text_.data(), text_.length(), &status); - if (u_text_ == nullptr) { - return false; - } + utext_openUTF8(&u_text_, text_.data(), text_.length(), &status); break_iterator_ = ubrk_open(UBRK_WORD, locale_.data(), /*text=*/nullptr, /*textLength=*/0, &status); - ubrk_setUText(break_iterator_, u_text_, &status); + ubrk_setUText(break_iterator_, &u_text_, &status); return !U_FAILURE(status); } @@ -290,6 +291,23 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { term_start_index_ = 0; } + bool IsValidSegment() const { + // Rule 1: all ASCII terms will be returned. + // We know it's a ASCII term by checking the first char. + if (i18n_utils::IsAscii(text_[term_start_index_])) { + return true; + } + + UChar32 uchar32 = i18n_utils::GetUChar32At(text_.data(), text_.length(), + term_start_index_); + // Rule 2: for non-ASCII terms, only the alphabetic terms are returned. + // We know it's an alphabetic term by checking the first unicode character. 
+ if (u_isUAlphabetic(uchar32)) { + return true; + } + return false; + } + // The underlying class that does the segmentation, ubrk_close() must be // called after using. UBreakIterator* break_iterator_; @@ -303,8 +321,8 @@ std::string_view locale_; // A thin wrapper around the input UTF8 text, needed by break_iterator_. - // Allocated by calling utext_openUTF8() and freed by calling utext_close(). - UText* u_text_; + // utext_close() must be called after using. + UText u_text_; // Offset iterator. This iterator is not guaranteed to point to any particular // character, but is guaranteed to point to a valid UTF character sequence. diff --git a/icing/tokenization/icu/icu-language-segmenter_test.cc b/icing/tokenization/icu/icu-language-segmenter_test.cc index 4098be5..01eb7d8 100644 --- a/icing/tokenization/icu/icu-language-segmenter_test.cc +++ b/icing/tokenization/icu/icu-language-segmenter_test.cc @@ -21,8 +21,8 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" #include "icing/absl_ports/str_cat.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/testing/common-matchers.h" -#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/icu-i18n-test-utils.h" #include "icing/testing/jni-test-helpers.h" #include "icing/testing/test-data.h" @@ -191,7 +191,7 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, Non_ASCII_Non_Alphabetic) { // Full-width (non-ASCII) punctuation marks and special characters are left // out. EXPECT_THAT(language_segmenter->GetAllTerms("。？·Hello！×"), - IsOkAndHolds(ElementsAre("。", "？", "·", "Hello", "！", "×"))); + IsOkAndHolds(ElementsAre("Hello"))); } TEST_P(IcuLanguageSegmenterAllLocalesTest, Acronym) { @@ -252,9 +252,9 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, WordConnector) { // Connectors don't connect if one side is an invalid term (？) EXPECT_THAT(language_segmenter->GetAllTerms("bar:baz:？"), - IsOkAndHolds(ElementsAre("bar:baz", ":", "？"))); + IsOkAndHolds(ElementsAre("bar:baz", ":"))); EXPECT_THAT(language_segmenter->GetAllTerms("？:bar:baz"), - IsOkAndHolds(ElementsAre("？", ":", "bar:baz"))); + IsOkAndHolds(ElementsAre(":", "bar:baz"))); EXPECT_THAT(language_segmenter->GetAllTerms("3:14"), IsOkAndHolds(ElementsAre("3", ":", "14"))); EXPECT_THAT(language_segmenter->GetAllTerms("私:は"), @@ -372,15 +372,6 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, Number) { IsOkAndHolds(ElementsAre("-", "123"))); } -TEST_P(IcuLanguageSegmenterAllLocalesTest, FullWidthNumbers) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create( - GetSegmenterOptions(GetLocale(), jni_cache_.get()))); - EXPECT_THAT(language_segmenter->GetAllTerms("０１２３４５６７８９"), - IsOkAndHolds(ElementsAre("０１２３４５６７８９"))); -} - TEST_P(IcuLanguageSegmenterAllLocalesTest, ContinuousWhitespaces) { ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, @@ -417,16 +408,15 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, CJKT) { // have whitespaces as word delimiter.
// Chinese - EXPECT_THAT( - language_segmenter->GetAllTerms("我每天走路去上班。"), - IsOkAndHolds(ElementsAre("我", "每天", "走路", "去", "上班", "。"))); + EXPECT_THAT(language_segmenter->GetAllTerms("我每天走路去上班。"), + IsOkAndHolds(ElementsAre("我", "每天", "走路", "去", "上班"))); // Japanese EXPECT_THAT(language_segmenter->GetAllTerms("私は毎日仕事に歩いています。"), IsOkAndHolds(ElementsAre("私", "は", "毎日", "仕事", "に", "歩", - "い", "てい", "ます", "。"))); + "い", "てい", "ます"))); // Khmer EXPECT_THAT(language_segmenter->GetAllTerms("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"), - IsOkAndHolds(ElementsAre("ញុំ", "ដើរទៅ", "ធ្វើការ", "រាល់ថ្ងៃ", "។"))); + IsOkAndHolds(ElementsAre("ញុំ", "ដើរទៅ", "ធ្វើការ", "រាល់ថ្ងៃ"))); // Thai EXPECT_THAT( language_segmenter->GetAllTerms("ฉันเดินไปทำงานทุกวัน"), @@ -859,19 +849,16 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermAfterUtf32) { ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment(kChinese)); // String: "我每天走路去上班。" - // ^ ^ ^ ^^ ^ - // UTF-8 idx: 0 3 9 15 18 24 - // UTF-32 idx: 0 1 3 5 6 8 + // ^ ^ ^ ^^ + // UTF-8 idx: 0 3 9 15 18 + // UTF-32 idx: 0 1 3 5 6 EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1))); EXPECT_THAT(itr->GetTerm(), Eq("每天")); EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(3))); EXPECT_THAT(itr->GetTerm(), Eq("走路")); - EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7), IsOkAndHolds(Eq(8))); - EXPECT_THAT(itr->GetTerm(), Eq("。")); - - EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(8), + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); } @@ -886,21 +873,18 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermAfterUtf32) { ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment(kJapanese)); // String: "私は毎日仕事に歩いています。" - // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ - // UTF-8 idx: 0 3 6 12 18 21 24 27 33 39 - // UTF-32 idx: 0 1 2 4 6 7 8 9 11 13 + // ^ ^ ^ ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 3 6 12 18 21 24 27 33 + // UTF-32 idx: 0 1 2 4 6 7 8 9 11 EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1))); EXPECT_THAT(itr->GetTerm(), Eq("は")); - EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(13), + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(11), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(3), IsOkAndHolds(Eq(4))); EXPECT_THAT(itr->GetTerm(), Eq("仕事")); - - EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(12), IsOkAndHolds(Eq(13))); - EXPECT_THAT(itr->GetTerm(), Eq("。")); } TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermAfterUtf32) { @@ -912,16 +896,13 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermAfterUtf32) { ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment(kKhmer)); // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។" - // ^ ^ ^ ^ ^ - // UTF-8 idx: 0 9 24 45 69 - // UTF-32 idx: 0 3 8 15 23 + // ^ ^ ^ ^ + // UTF-8 idx: 0 9 24 45 + // UTF-32 idx: 0 3 8 15 EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3))); EXPECT_THAT(itr->GetTerm(), Eq("ដើរទៅ")); - EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15), IsOkAndHolds(Eq(23))); - EXPECT_THAT(itr->GetTerm(), Eq("។")); - - EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(23), + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty());
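// Every expectation change above follows from the IsValidSegment() rules
// added in icu-language-segmenter.cc: ASCII terms always survive, and
// non-ASCII terms survive only when they start with an alphabetic code
// point. A self-contained sketch of that predicate (the free-function form,
// its name, and the header paths are illustrative, not the real API):
#include <string_view>

#include "icing/util/i18n-utils.h"
#include "unicode/uchar.h"

bool SegmentIsKept(std::string_view text, int term_start_index) {
  // Rule 1: terms beginning with an ASCII character are kept, which is why
  // "bar:baz", ":", "3", and "14" still appear in the expectations.
  if (i18n_utils::IsAscii(text[term_start_index])) {
    return true;
  }
  // Rule 2: a non-ASCII term is kept only if its first code point is
  // alphabetic, which drops "。", "។", and other full-width punctuation.
  UChar32 uchar32 =
      i18n_utils::GetUChar32At(text.data(), text.length(), term_start_index);
  return u_isUAlphabetic(uchar32);
}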
diff --git a/icing/tokenization/language-segmenter-iterator_test.cc b/icing/tokenization/language-segmenter-iterator_test.cc index 3aff45c..d293581 100644 --- a/icing/tokenization/language-segmenter-iterator_test.cc +++ b/icing/tokenization/language-segmenter-iterator_test.cc @@ -15,9 +15,9 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" #include "icing/absl_ports/str_cat.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/platform.h" #include "icing/testing/common-matchers.h" -#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/jni-test-helpers.h" #include "icing/testing/test-data.h" #include "icing/tokenization/language-segmenter-factory.h" diff --git a/icing/tokenization/language-segmenter_benchmark.cc b/icing/tokenization/language-segmenter_benchmark.cc index 6f7d4df..bd86169 100644 --- a/icing/tokenization/language-segmenter_benchmark.cc +++ b/icing/tokenization/language-segmenter_benchmark.cc @@ -14,8 +14,8 @@ #include "testing/base/public/benchmark.h" #include "gmock/gmock.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/testing/common-matchers.h" -#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/test-data.h" #include "icing/tokenization/language-segmenter-factory.h" #include "icing/tokenization/language-segmenter.h" diff --git a/icing/tokenization/plain-tokenizer.cc b/icing/tokenization/plain-tokenizer.cc index 7a1949f..13fe550 100644 --- a/icing/tokenization/plain-tokenizer.cc +++ b/icing/tokenization/plain-tokenizer.cc @@ -66,9 +66,9 @@ class PlainTokenIterator : public Tokenizer::Iterator { Token GetToken() const override { if (current_term_.empty()) { - return Token(Token::Type::INVALID); + return Token(Token::INVALID); } - return Token(Token::Type::REGULAR, current_term_); + return Token(Token::REGULAR, current_term_); } libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenStart() @@ -81,8 +81,8 @@ class PlainTokenIterator : public Tokenizer::Iterator { return base_iterator_->CalculateTermEndExclusive(); } - bool ResetToTokenStartingAfter(int32_t utf32_offset) override { - if (!base_iterator_->ResetToTermStartingAfterUtf32(utf32_offset).ok()) { + bool ResetToTokenAfter(int32_t offset) override { + if (!base_iterator_->ResetToTermStartingAfterUtf32(offset).ok()) { return false; } current_term_ = base_iterator_->GetTerm(); @@ -93,17 +93,15 @@ class PlainTokenIterator : public Tokenizer::Iterator { return true; } - bool ResetToTokenEndingBefore(int32_t utf32_offset) override { + bool ResetToTokenBefore(int32_t offset) override { ICING_ASSIGN_OR_RETURN( - utf32_offset, - base_iterator_->ResetToTermEndingBeforeUtf32(utf32_offset), false); + offset, base_iterator_->ResetToTermEndingBeforeUtf32(offset), false); current_term_ = base_iterator_->GetTerm(); while (!IsValidTerm(current_term_)) { // Haven't found a valid term yet. Retrieve the term prior to this one // from the segmenter. 
ICING_ASSIGN_OR_RETURN( - utf32_offset, - base_iterator_->ResetToTermEndingBeforeUtf32(utf32_offset), false); + offset, base_iterator_->ResetToTermEndingBeforeUtf32(offset), false); current_term_ = base_iterator_->GetTerm(); } return true; diff --git a/icing/tokenization/plain-tokenizer_test.cc b/icing/tokenization/plain-tokenizer_test.cc index c48b51e..7490bfa 100644 --- a/icing/tokenization/plain-tokenizer_test.cc +++ b/icing/tokenization/plain-tokenizer_test.cc @@ -18,9 +18,9 @@ #include "gmock/gmock.h" #include "icing/absl_ports/str_cat.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/platform.h" #include "icing/testing/common-matchers.h" -#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/icu-i18n-test-utils.h" #include "icing/testing/jni-test-helpers.h" #include "icing/testing/test-data.h" @@ -68,27 +68,26 @@ TEST_F(PlainTokenizerTest, Simple) { EXPECT_THAT(plain_tokenizer->TokenizeAll(""), IsOkAndHolds(IsEmpty())); - EXPECT_THAT( - plain_tokenizer->TokenizeAll("Hello World"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"), - EqualsToken(Token::Type::REGULAR, "World")))); + EXPECT_THAT(plain_tokenizer->TokenizeAll("Hello World"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"), + EqualsToken(Token::REGULAR, "World")))); EXPECT_THAT( plain_tokenizer->TokenizeAll( "Lorem ipsum dolor sit amet, consectetur adipiscing elit. " "Duis efficitur iaculis auctor."), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Lorem"), - EqualsToken(Token::Type::REGULAR, "ipsum"), - EqualsToken(Token::Type::REGULAR, "dolor"), - EqualsToken(Token::Type::REGULAR, "sit"), - EqualsToken(Token::Type::REGULAR, "amet"), - EqualsToken(Token::Type::REGULAR, "consectetur"), - EqualsToken(Token::Type::REGULAR, "adipiscing"), - EqualsToken(Token::Type::REGULAR, "elit"), - EqualsToken(Token::Type::REGULAR, "Duis"), - EqualsToken(Token::Type::REGULAR, "efficitur"), - EqualsToken(Token::Type::REGULAR, "iaculis"), - EqualsToken(Token::Type::REGULAR, "auctor")))); + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Lorem"), + EqualsToken(Token::REGULAR, "ipsum"), + EqualsToken(Token::REGULAR, "dolor"), + EqualsToken(Token::REGULAR, "sit"), + EqualsToken(Token::REGULAR, "amet"), + EqualsToken(Token::REGULAR, "consectetur"), + EqualsToken(Token::REGULAR, "adipiscing"), + EqualsToken(Token::REGULAR, "elit"), + EqualsToken(Token::REGULAR, "Duis"), + EqualsToken(Token::REGULAR, "efficitur"), + EqualsToken(Token::REGULAR, "iaculis"), + EqualsToken(Token::REGULAR, "auctor")))); } TEST_F(PlainTokenizerTest, Whitespace) { @@ -108,18 +107,16 @@ TEST_F(PlainTokenizerTest, Whitespace) { // 0x0009 is horizontal tab, considered as a whitespace std::string text_with_horizontal_tab = absl_ports::StrCat("Hello", UCharToString(0x0009), "World"); - EXPECT_THAT( - plain_tokenizer->TokenizeAll(text_with_horizontal_tab), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"), - EqualsToken(Token::Type::REGULAR, "World")))); + EXPECT_THAT(plain_tokenizer->TokenizeAll(text_with_horizontal_tab), + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"), + EqualsToken(Token::REGULAR, "World")))); // 0x000B is vertical tab, considered as a whitespace std::string text_with_vertical_tab = absl_ports::StrCat("Hello", UCharToString(0x000B), "World"); - EXPECT_THAT( - plain_tokenizer->TokenizeAll(text_with_vertical_tab), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"), - EqualsToken(Token::Type::REGULAR, 
"World")))); + EXPECT_THAT(plain_tokenizer->TokenizeAll(text_with_vertical_tab), + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"), + EqualsToken(Token::REGULAR, "World")))); } TEST_F(PlainTokenizerTest, Punctuation) { @@ -134,39 +131,38 @@ TEST_F(PlainTokenizerTest, Punctuation) { language_segmenter.get())); // Half-width punctuation marks are filtered out. - EXPECT_THAT( - plain_tokenizer->TokenizeAll( - "Hello, World! Hello: World. \"Hello\" World?"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"), - EqualsToken(Token::Type::REGULAR, "World"), - EqualsToken(Token::Type::REGULAR, "Hello"), - EqualsToken(Token::Type::REGULAR, "World"), - EqualsToken(Token::Type::REGULAR, "Hello"), - EqualsToken(Token::Type::REGULAR, "World")))); + EXPECT_THAT(plain_tokenizer->TokenizeAll( + "Hello, World! Hello: World. \"Hello\" World?"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"), + EqualsToken(Token::REGULAR, "World"), + EqualsToken(Token::REGULAR, "Hello"), + EqualsToken(Token::REGULAR, "World"), + EqualsToken(Token::REGULAR, "Hello"), + EqualsToken(Token::REGULAR, "World")))); // Full-width punctuation marks are filtered out. std::vector<std::string_view> exp_tokens; if (IsCfStringTokenization()) { EXPECT_THAT( plain_tokenizer->TokenizeAll("你好,世界!你好:世界。“你好”世界?"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "你"), - EqualsToken(Token::Type::REGULAR, "好"), - EqualsToken(Token::Type::REGULAR, "世界"), - EqualsToken(Token::Type::REGULAR, "你"), - EqualsToken(Token::Type::REGULAR, "好"), - EqualsToken(Token::Type::REGULAR, "世界"), - EqualsToken(Token::Type::REGULAR, "你"), - EqualsToken(Token::Type::REGULAR, "好"), - EqualsToken(Token::Type::REGULAR, "世界")))); + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "你"), + EqualsToken(Token::REGULAR, "好"), + EqualsToken(Token::REGULAR, "世界"), + EqualsToken(Token::REGULAR, "你"), + EqualsToken(Token::REGULAR, "好"), + EqualsToken(Token::REGULAR, "世界"), + EqualsToken(Token::REGULAR, "你"), + EqualsToken(Token::REGULAR, "好"), + EqualsToken(Token::REGULAR, "世界")))); } else { EXPECT_THAT( plain_tokenizer->TokenizeAll("你好,世界!你好:世界。“你好”世界?"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "你好"), - EqualsToken(Token::Type::REGULAR, "世界"), - EqualsToken(Token::Type::REGULAR, "你好"), - EqualsToken(Token::Type::REGULAR, "世界"), - EqualsToken(Token::Type::REGULAR, "你好"), - EqualsToken(Token::Type::REGULAR, "世界")))); + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "你好"), + EqualsToken(Token::REGULAR, "世界"), + EqualsToken(Token::REGULAR, "你好"), + EqualsToken(Token::REGULAR, "世界"), + EqualsToken(Token::REGULAR, "你好"), + EqualsToken(Token::REGULAR, "世界")))); } } @@ -184,16 +180,14 @@ TEST_F(PlainTokenizerTest, SpecialCharacters) { // Right now we don't have special logic for these characters, just output // them as tokens. 
- EXPECT_THAT( - plain_tokenizer->TokenizeAll("1+1"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "1"), - EqualsToken(Token::Type::REGULAR, "+"), - EqualsToken(Token::Type::REGULAR, "1")))); + EXPECT_THAT(plain_tokenizer->TokenizeAll("1+1"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "1"), + EqualsToken(Token::REGULAR, "+"), + EqualsToken(Token::REGULAR, "1")))); - EXPECT_THAT( - plain_tokenizer->TokenizeAll("$50"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "$"), - EqualsToken(Token::Type::REGULAR, "50")))); + EXPECT_THAT(plain_tokenizer->TokenizeAll("$50"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "$"), + EqualsToken(Token::REGULAR, "50")))); } TEST_F(PlainTokenizerTest, CJKT) { @@ -209,13 +203,12 @@ TEST_F(PlainTokenizerTest, CJKT) { tokenizer_factory::CreateIndexingTokenizer( StringIndexingConfig::TokenizerType::PLAIN, language_segmenter.get())); - EXPECT_THAT( - plain_tokenizer->TokenizeAll("我每天走路去上班。"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "我"), - EqualsToken(Token::Type::REGULAR, "每天"), - EqualsToken(Token::Type::REGULAR, "走路"), - EqualsToken(Token::Type::REGULAR, "去"), - EqualsToken(Token::Type::REGULAR, "上班")))); + EXPECT_THAT(plain_tokenizer->TokenizeAll("我每天走路去上班。"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "我"), + EqualsToken(Token::REGULAR, "每天"), + EqualsToken(Token::REGULAR, "走路"), + EqualsToken(Token::REGULAR, "去"), + EqualsToken(Token::REGULAR, "上班")))); // Japanese options = language_segmenter_factory::SegmenterOptions(ULOC_JAPANESE, jni_cache_.get()); @@ -227,44 +220,41 @@ TEST_F(PlainTokenizerTest, CJKT) { StringIndexingConfig::TokenizerType::PLAIN, language_segmenter.get())); if (IsCfStringTokenization()) { - EXPECT_THAT( - plain_tokenizer->TokenizeAll("私は毎日仕事に歩いています。"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "私"), - EqualsToken(Token::Type::REGULAR, "は"), - EqualsToken(Token::Type::REGULAR, "毎日"), - EqualsToken(Token::Type::REGULAR, "仕事"), - EqualsToken(Token::Type::REGULAR, "に"), - EqualsToken(Token::Type::REGULAR, "歩い"), - EqualsToken(Token::Type::REGULAR, "て"), - EqualsToken(Token::Type::REGULAR, "い"), - EqualsToken(Token::Type::REGULAR, "ます")))); + EXPECT_THAT(plain_tokenizer->TokenizeAll("私は毎日仕事に歩いています。"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "私"), + EqualsToken(Token::REGULAR, "は"), + EqualsToken(Token::REGULAR, "毎日"), + EqualsToken(Token::REGULAR, "仕事"), + EqualsToken(Token::REGULAR, "に"), + EqualsToken(Token::REGULAR, "歩い"), + EqualsToken(Token::REGULAR, "て"), + EqualsToken(Token::REGULAR, "い"), + EqualsToken(Token::REGULAR, "ます")))); } else { - EXPECT_THAT( - plain_tokenizer->TokenizeAll("私は毎日仕事に歩いています。"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "私"), - EqualsToken(Token::Type::REGULAR, "は"), - EqualsToken(Token::Type::REGULAR, "毎日"), - EqualsToken(Token::Type::REGULAR, "仕事"), - EqualsToken(Token::Type::REGULAR, "に"), - EqualsToken(Token::Type::REGULAR, "歩"), - EqualsToken(Token::Type::REGULAR, "い"), - EqualsToken(Token::Type::REGULAR, "てい"), - EqualsToken(Token::Type::REGULAR, "ます")))); + EXPECT_THAT(plain_tokenizer->TokenizeAll("私は毎日仕事に歩いています。"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "私"), + EqualsToken(Token::REGULAR, "は"), + EqualsToken(Token::REGULAR, "毎日"), + EqualsToken(Token::REGULAR, "仕事"), + EqualsToken(Token::REGULAR, "に"), + EqualsToken(Token::REGULAR, "歩"), + EqualsToken(Token::REGULAR, "い"), + EqualsToken(Token::REGULAR, "てい"), + EqualsToken(Token::REGULAR, "ます")))); } // 
Khmer - EXPECT_THAT( - plain_tokenizer->TokenizeAll("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "ញុំ"), - EqualsToken(Token::Type::REGULAR, "ដើរទៅ"), - EqualsToken(Token::Type::REGULAR, "ធ្វើការ"), - EqualsToken(Token::Type::REGULAR, "រាល់ថ្ងៃ")))); + EXPECT_THAT(plain_tokenizer->TokenizeAll("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "ញុំ"), + EqualsToken(Token::REGULAR, "ដើរទៅ"), + EqualsToken(Token::REGULAR, "ធ្វើការ"), + EqualsToken(Token::REGULAR, "រាល់ថ្ងៃ")))); // Korean - EXPECT_THAT(plain_tokenizer->TokenizeAll("나는 매일 출근합니다."), - IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::REGULAR, "나는"), - EqualsToken(Token::Type::REGULAR, "매일"), - EqualsToken(Token::Type::REGULAR, "출근합니다")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll("나는 매일 출근합니다."), + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "나는"), + EqualsToken(Token::REGULAR, "매일"), + EqualsToken(Token::REGULAR, "출근합니다")))); // Thai // DIFFERENCE!! Disagreement over how to segment "ทุกวัน" (iOS groups). @@ -274,24 +264,23 @@ TEST_F(PlainTokenizerTest, CJKT) { std::vector<Token> tokens, plain_tokenizer->TokenizeAll("ฉันเดินไปทำงานทุกวัน")); - EXPECT_THAT(tokens, ElementsAre(EqualsToken(Token::Type::REGULAR, "ฉัน"), - EqualsToken(Token::Type::REGULAR, "เดิน"), - EqualsToken(Token::Type::REGULAR, "ไป"), - EqualsToken(Token::Type::REGULAR, "ทำงาน"), - EqualsToken(Token::Type::REGULAR, "ทุกวัน"))); + EXPECT_THAT(tokens, ElementsAre(EqualsToken(Token::REGULAR, "ฉัน"), + EqualsToken(Token::REGULAR, "เดิน"), + EqualsToken(Token::REGULAR, "ไป"), + EqualsToken(Token::REGULAR, "ทำงาน"), + EqualsToken(Token::REGULAR, "ทุกวัน"))); } else { - EXPECT_THAT( - plain_tokenizer->TokenizeAll("ฉันเดินไปทำงานทุกวัน"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "ฉัน"), - EqualsToken(Token::Type::REGULAR, "เดิน"), - EqualsToken(Token::Type::REGULAR, "ไป"), - EqualsToken(Token::Type::REGULAR, "ทำงาน"), - EqualsToken(Token::Type::REGULAR, "ทุก"), - EqualsToken(Token::Type::REGULAR, "วัน")))); + EXPECT_THAT(plain_tokenizer->TokenizeAll("ฉันเดินไปทำงานทุกวัน"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "ฉัน"), + EqualsToken(Token::REGULAR, "เดิน"), + EqualsToken(Token::REGULAR, "ไป"), + EqualsToken(Token::REGULAR, "ทำงาน"), + EqualsToken(Token::REGULAR, "ทุก"), + EqualsToken(Token::REGULAR, "วัน")))); } } -TEST_F(PlainTokenizerTest, ResetToTokenStartingAfterSimple) { +TEST_F(PlainTokenizerTest, ResetToTokenAfterSimple) { language_segmenter_factory::SegmenterOptions options(ULOC_US, jni_cache_.get()); ICING_ASSERT_OK_AND_ASSIGN( @@ -305,13 +294,13 @@ TEST_F(PlainTokenizerTest, ResetToTokenStartingAfterSimple) { constexpr std::string_view kText = "f b"; auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie(); - EXPECT_TRUE(iterator->ResetToTokenStartingAfter(0)); - EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::Type::REGULAR, "b")); + EXPECT_TRUE(iterator->ResetToTokenAfter(0)); + EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "b")); - EXPECT_FALSE(iterator->ResetToTokenStartingAfter(2)); + EXPECT_FALSE(iterator->ResetToTokenAfter(2)); } -TEST_F(PlainTokenizerTest, ResetToTokenEndingBeforeSimple) { +TEST_F(PlainTokenizerTest, ResetToTokenBeforeSimple) { language_segmenter_factory::SegmenterOptions options(ULOC_US, jni_cache_.get()); ICING_ASSERT_OK_AND_ASSIGN( @@ -325,13 +314,13 @@ TEST_F(PlainTokenizerTest, ResetToTokenEndingBeforeSimple) { constexpr std::string_view kText = "f b"; auto iterator = 
plain_tokenizer->Tokenize(kText).ValueOrDie(); - EXPECT_TRUE(iterator->ResetToTokenEndingBefore(2)); - EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::Type::REGULAR, "f")); + EXPECT_TRUE(iterator->ResetToTokenBefore(2)); + EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "f")); - EXPECT_FALSE(iterator->ResetToTokenEndingBefore(0)); + EXPECT_FALSE(iterator->ResetToTokenBefore(0)); } -TEST_F(PlainTokenizerTest, ResetToTokenStartingAfter) { +TEST_F(PlainTokenizerTest, ResetToTokenAfter) { language_segmenter_factory::SegmenterOptions options(ULOC_US, jni_cache_.get()); ICING_ASSERT_OK_AND_ASSIGN( @@ -343,12 +332,11 @@ TEST_F(PlainTokenizerTest, ResetToTokenStartingAfter) { language_segmenter.get())); constexpr std::string_view kText = " foo . bar baz.. bat "; - EXPECT_THAT( - plain_tokenizer->TokenizeAll(kText), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "foo"), - EqualsToken(Token::Type::REGULAR, "bar"), - EqualsToken(Token::Type::REGULAR, "baz"), - EqualsToken(Token::Type::REGULAR, "bat")))); + EXPECT_THAT(plain_tokenizer->TokenizeAll(kText), + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "foo"), + EqualsToken(Token::REGULAR, "bar"), + EqualsToken(Token::REGULAR, "baz"), + EqualsToken(Token::REGULAR, "bat")))); std::vector<std::string> expected_text = { "foo", // 0: " foo . bar" "bar", // 1: "foo . bar " @@ -371,19 +359,19 @@ TEST_F(PlainTokenizerTest, ResetToTokenStartingAfter) { auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie(); EXPECT_TRUE(iterator->Advance()); - EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::Type::REGULAR, "foo")); + EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "foo")); for (int i = 0; i < kText.length(); ++i) { if (i < expected_text.size()) { - EXPECT_TRUE(iterator->ResetToTokenStartingAfter(i)); + EXPECT_TRUE(iterator->ResetToTokenAfter(i)); EXPECT_THAT(iterator->GetToken(), - EqualsToken(Token::Type::REGULAR, expected_text[i])); + EqualsToken(Token::REGULAR, expected_text[i])); } else { - EXPECT_FALSE(iterator->ResetToTokenStartingAfter(i)); + EXPECT_FALSE(iterator->ResetToTokenAfter(i)); } } } -TEST_F(PlainTokenizerTest, ResetToTokenEndingBefore) { +TEST_F(PlainTokenizerTest, ResetToTokenBefore) { language_segmenter_factory::SegmenterOptions options(ULOC_US, jni_cache_.get()); ICING_ASSERT_OK_AND_ASSIGN( @@ -395,12 +383,11 @@ TEST_F(PlainTokenizerTest, ResetToTokenEndingBefore) { language_segmenter.get())); constexpr std::string_view kText = " foo . bar baz.. bat "; - EXPECT_THAT( - plain_tokenizer->TokenizeAll(kText), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "foo"), - EqualsToken(Token::Type::REGULAR, "bar"), - EqualsToken(Token::Type::REGULAR, "baz"), - EqualsToken(Token::Type::REGULAR, "bat")))); + EXPECT_THAT(plain_tokenizer->TokenizeAll(kText), + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "foo"), + EqualsToken(Token::REGULAR, "bar"), + EqualsToken(Token::REGULAR, "baz"), + EqualsToken(Token::REGULAR, "bat")))); std::vector<std::string> expected_text = { "bat", // 20: "baz.. bat " "baz", // 19: " baz.. 
bat" @@ -423,16 +410,15 @@ TEST_F(PlainTokenizerTest, ResetToTokenEndingBefore) { auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie(); EXPECT_TRUE(iterator->Advance()); - EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::Type::REGULAR, "foo")); + EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "foo")); for (int i = kText.length() - 1; i >= 0; --i) { int expected_index = kText.length() - 1 - i; if (expected_index < expected_text.size()) { - EXPECT_TRUE(iterator->ResetToTokenEndingBefore(i)); - EXPECT_THAT( - iterator->GetToken(), - EqualsToken(Token::Type::REGULAR, expected_text[expected_index])); + EXPECT_TRUE(iterator->ResetToTokenBefore(i)); + EXPECT_THAT(iterator->GetToken(), + EqualsToken(Token::REGULAR, expected_text[expected_index])); } else { - EXPECT_FALSE(iterator->ResetToTokenEndingBefore(i)); + EXPECT_FALSE(iterator->ResetToTokenBefore(i)); } } } diff --git a/icing/tokenization/raw-query-tokenizer.cc b/icing/tokenization/raw-query-tokenizer.cc index ff449a7..205d3a2 100644 --- a/icing/tokenization/raw-query-tokenizer.cc +++ b/icing/tokenization/raw-query-tokenizer.cc @@ -14,8 +14,9 @@ #include "icing/tokenization/raw-query-tokenizer.h" +#include <stddef.h> + #include <cctype> -#include <cstddef> #include <memory> #include <string> #include <string_view> @@ -102,7 +103,7 @@ enum State { // When seeing right parentheses CLOSING_PARENTHESES = 8, - PROCESSING_NON_ASCII_ALPHANUMERIC_TERM = 9, + PROCESSING_NON_ASCII_ALPHABETIC_TERM = 9, PROCESSING_PROPERTY_TERM_APPENDING = 10, @@ -119,7 +120,7 @@ enum TermType { // A term that consists of unicode alphabetic and numeric characters ASCII_ALPHANUMERIC_TERM = 1, - NON_ASCII_ALPHANUMERIC_TERM = 2, + NON_ASCII_ALPHABETIC_TERM = 2, // "(" LEFT_PARENTHESES = 3, @@ -208,7 +209,7 @@ std::string_view GetErrorMessage(ActionOrError maybe_error) { // PROCESSING_OR = 6 // OPENING_PARENTHESES = 7 // CLOSING_PARENTHESES = 8 -// PROCESSING_NON_ASCII_ALPHANUMERIC_TERM = 9 +// PROCESSING_NON_ASCII_ALPHABETIC_TERM = 9 // PROCESSING_PROPERTY_TERM_APPENDING = 10 // // Actions: @@ -252,40 +253,40 @@ std::string_view GetErrorMessage(ActionOrError maybe_error) { // like "+", "&", "@", "#" in indexing and query tokenizers. 
constexpr State state_transition_rules[STATE_COUNT][TYPE_COUNT] = { /*State: Ready*/ - {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM, + {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM, OPENING_PARENTHESES, CLOSING_PARENTHESES, PROCESSING_EXCLUSION, PROCESSING_OR, READY, READY}, /*State: PROCESSING_ALPHANUMERIC_TERM*/ - {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM, + {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM, OPENING_PARENTHESES, CLOSING_PARENTHESES, READY, INVALID, PROCESSING_PROPERTY_RESTRICT, READY}, /*State: PROCESSING_EXCLUSION*/ {READY, PROCESSING_EXCLUSION_TERM, PROCESSING_EXCLUSION_TERM, INVALID, CLOSING_PARENTHESES, PROCESSING_EXCLUSION, INVALID, INVALID, READY}, /*State: PROCESSING_EXCLUSION_TERM*/ - {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM, + {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM, OPENING_PARENTHESES, CLOSING_PARENTHESES, READY, INVALID, INVALID, READY}, /*State: PROCESSING_PROPERTY_RESTRICT*/ {READY, PROCESSING_PROPERTY_TERM, PROCESSING_PROPERTY_TERM, INVALID, CLOSING_PARENTHESES, INVALID, INVALID, PROCESSING_PROPERTY_RESTRICT, READY}, /*State: PROCESSING_PROPERTY_TERM*/ - {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM, + {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM, OPENING_PARENTHESES, CLOSING_PARENTHESES, READY, INVALID, PROCESSING_PROPERTY_TERM_APPENDING, READY}, /*State: PROCESSING_OR*/ {READY, INVALID, INVALID, OPENING_PARENTHESES, CLOSING_PARENTHESES, INVALID, INVALID, INVALID, READY}, /*State: OPENING_PARENTHESES*/ - {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM, + {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM, OPENING_PARENTHESES, CLOSING_PARENTHESES, PROCESSING_EXCLUSION, OPENING_PARENTHESES, READY, READY}, /*State: CLOSING_PARENTHESES*/ - {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM, + {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM, OPENING_PARENTHESES, CLOSING_PARENTHESES, PROCESSING_EXCLUSION, PROCESSING_OR, INVALID, READY}, - /*State: PROCESSING_NON_ASCII_ALPHANUMERIC_TERM*/ - {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM, + /*State: PROCESSING_NON_ASCII_ALPHABETIC_TERM*/ + {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM, OPENING_PARENTHESES, CLOSING_PARENTHESES, READY, INVALID, INVALID, READY}, /*State: PROCESSING_PROPERTY_TERM_APPENDING*/ {READY, PROCESSING_PROPERTY_TERM_APPENDING, @@ -326,7 +327,7 @@ constexpr ActionOrError action_rules[STATE_COUNT][TYPE_COUNT] = { /*State: CLOSING_PARENTHESES*/ {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, ERROR_GROUP_AS_PROPERTY_NAME, OUTPUT}, - /*State: PROCESSING_NON_ASCII_ALPHANUMERIC_TERM*/ + /*State: PROCESSING_NON_ASCII_ALPHABETIC_TERM*/ {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, ERROR_NO_WHITESPACE_AROUND_OR, ERROR_NON_ASCII_AS_PROPERTY_NAME, OUTPUT}, /*State: PROCESSING_PROPERTY_TERM_APPENDING*/ @@ -345,40 +346,6 @@ std::pair<TermType, std::string_view> GetWhitespaceTerm(std::string_view text, return std::make_pair(WHITESPACE, text.substr(pos, cur - pos)); } -TermType GetContentTermType(std::string_view text, size_t pos) { - if (i18n_utils::IsPunctuationAt(text, pos)) { - return OTHER; - } else if (i18n_utils::IsAscii(text[pos])) { - return ASCII_ALPHANUMERIC_TERM; - } - 
return NON_ASCII_ALPHANUMERIC_TERM; -} - -bool IsContentTermType(TermType term_type) { - switch (term_type) { - case ASCII_ALPHANUMERIC_TERM: - [[fallthrough]]; - case NON_ASCII_ALPHANUMERIC_TERM: - [[fallthrough]]; - case OTHER: - return true; - case WHITESPACE: - [[fallthrough]]; - case LEFT_PARENTHESES: - [[fallthrough]]; - case RIGHT_PARENTHESES: - [[fallthrough]]; - case EXCLUSION_OPERATOR: - [[fallthrough]]; - case OR_OPERATOR: - [[fallthrough]]; - case COLON: - [[fallthrough]]; - case TYPE_COUNT: - return false; - } -} - // Determines the length of the potential content term beginning at text[pos] // and returns a pair with the appropriate TermType and a string_view of the // content term.
@@ -391,7 +358,12 @@ std::pair<TermType, std::string_view> GetContentTerm(std::string_view text, size_t pos) { size_t len = 0; // Checks the first char to see if it's an ASCII term - TermType type = GetContentTermType(text, pos); + TermType type = ASCII_ALPHANUMERIC_TERM; + if (!i18n_utils::IsAscii(text[pos])) { + type = NON_ASCII_ALPHABETIC_TERM; + } else if (!std::isalnum(text[pos])) { + type = OTHER; + } for (size_t cur = pos; cur < text.length() && len == 0; ++cur) { switch (text[cur]) { case kLeftParentheses:
@@ -451,7 +423,7 @@ std::pair<TermType, std::string_view> GetTerm(std::string_view text, // and [(cat OR)]. This helps assert extra rule 3: "OR" is ignored if there's no // valid token on its right. void RemoveLastTokenIfOrOperator(std::vector<Token>* tokens) { - if (!tokens->empty() && tokens->back().type == Token::Type::QUERY_OR) { + if (!tokens->empty() && tokens->back().type == Token::QUERY_OR) { tokens->pop_back(); } }
@@ -465,11 +437,11 @@ libtextclassifier3::Status OutputOrOperatorToken(std::vector<Token>* tokens) { } Token::Type last_token_type = tokens->back().type; switch (last_token_type) { - case Token::Type::REGULAR: - case Token::Type::QUERY_RIGHT_PARENTHESES: - tokens->emplace_back(Token::Type::QUERY_OR); + case Token::REGULAR: + case Token::QUERY_RIGHT_PARENTHESES: + tokens->emplace_back(Token::QUERY_OR); break; - case Token::Type::QUERY_OR: + case Token::QUERY_OR: // Ignores "OR" because there's already an "OR", e.g. "term1 OR OR term2" break; default:
@@ -499,7 +471,7 @@ libtextclassifier3::Status OutputToken(State new_state, switch (current_term_type) { case ASCII_ALPHANUMERIC_TERM: [[fallthrough]]; - case NON_ASCII_ALPHANUMERIC_TERM: + case NON_ASCII_ALPHABETIC_TERM: if (new_state == PROCESSING_PROPERTY_TERM) { // Asserts extra rule 1: each property name in the property path is a // valid term.
@@ -510,21 +482,21 @@ libtextclassifier3::Status OutputToken(State new_state, GetErrorMessage(ERROR_NON_ASCII_AS_PROPERTY_NAME)); } } - tokens->emplace_back(Token::Type::QUERY_PROPERTY, current_term); + tokens->emplace_back(Token::QUERY_PROPERTY, current_term); } else { - tokens->emplace_back(Token::Type::REGULAR, current_term); + tokens->emplace_back(Token::REGULAR, current_term); } break; case LEFT_PARENTHESES: - tokens->emplace_back(Token::Type::QUERY_LEFT_PARENTHESES); + tokens->emplace_back(Token::QUERY_LEFT_PARENTHESES); break; case RIGHT_PARENTHESES: // Ignores "OR" if it's followed by right parentheses.
RemoveLastTokenIfOrOperator(tokens); - tokens->emplace_back(Token::Type::QUERY_RIGHT_PARENTHESES); + tokens->emplace_back(Token::QUERY_RIGHT_PARENTHESES); break; case EXCLUSION_OPERATOR: - tokens->emplace_back(Token::Type::QUERY_EXCLUSION); + tokens->emplace_back(Token::QUERY_EXCLUSION); break; case OR_OPERATOR: return OutputOrOperatorToken(tokens);
@@ -569,8 +541,10 @@ libtextclassifier3::Status ProcessTerm( ICING_ASSIGN_OR_RETURN(std::vector<std::string_view> content_terms, language_segmenter->GetAllTerms(*current_term)); for (std::string_view term : content_terms) { - TermType type = GetContentTermType(term, 0); - if (type == OTHER) { + TermType type = ASCII_ALPHANUMERIC_TERM; + if (!i18n_utils::IsAscii(term[0])) { + type = NON_ASCII_ALPHABETIC_TERM; + } else if (!std::isalnum(term[0])) { // Skip OTHER tokens here. continue; }
@@ -616,7 +590,9 @@ libtextclassifier3::StatusOr<std::vector<Token>> ProcessTerms( for (int i = 0; i < prescanned_terms.size(); ++i) { const std::pair<TermType, std::string_view>& prescanned_term = prescanned_terms.at(i); - if (!IsContentTermType(prescanned_term.first)) { + if (prescanned_term.first != ASCII_ALPHANUMERIC_TERM && + prescanned_term.first != NON_ASCII_ALPHABETIC_TERM && + prescanned_term.first != OTHER) { // This can't be a property restrict. Just pass it in. ICING_RETURN_IF_ERROR( ProcessTerm(&current_state, &current_term, &current_term_type,
@@ -628,15 +604,18 @@ libtextclassifier3::StatusOr<std::vector<Token>> ProcessTerms( std::vector<std::string_view> content_terms, language_segmenter->GetAllTerms(prescanned_term.second)); for (std::string_view term : content_terms) { - TermType type = GetContentTermType(term, 0); + TermType type = ASCII_ALPHANUMERIC_TERM; if (term == kOrOperator) { // TODO(tjbarron) Decide whether we should revise this and other // handled syntax. This is used to allow queries like "term1,OR,term2" // to succeed. It's not clear if we should allow this or require // clients to ensure that OR operators are always surrounded by // whitespace. - // Override the type if this is actually an OR operator.
type = OR_OPERATOR; + } else if (!i18n_utils::IsAscii(term[0])) { + type = NON_ASCII_ALPHABETIC_TERM; + } else if (!std::isalnum(term[0])) { + type = OTHER; } ICING_RETURN_IF_ERROR(ProcessTerm(&current_state, &current_term, &current_term_type,
@@ -670,7 +649,7 @@ class RawQueryTokenIterator : public Tokenizer::Iterator { Token GetToken() const override { if (current_ < 0 || current_ >= tokens_.size()) { - return Token(Token::Type::INVALID); + return Token(Token::INVALID); } return tokens_.at(current_); }
diff --git a/icing/tokenization/raw-query-tokenizer_test.cc b/icing/tokenization/raw-query-tokenizer_test.cc index b1dcc73..500efa0 100644 --- a/icing/tokenization/raw-query-tokenizer_test.cc +++ b/icing/tokenization/raw-query-tokenizer_test.cc
@@ -16,9 +16,9 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/platform.h" #include "icing/testing/common-matchers.h" -#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/test-data.h" #include "icing/tokenization/language-segmenter-factory.h" #include "icing/tokenization/tokenizer-factory.h"
@@ -59,38 +59,13 @@ TEST_F(RawQueryTokenizerTest, Simple) { tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, language_segmenter.get())); - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("Hello World!"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"), - EqualsToken(Token::Type::REGULAR, "World")))); - - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("hElLo WORLD"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "hElLo"), - EqualsToken(Token::Type::REGULAR, "WORLD")))); -} - -TEST_F(RawQueryTokenizerTest, Emoji) { - language_segmenter_factory::SegmenterOptions options(ULOC_US); - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(std::move(options))); - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<Tokenizer> raw_query_tokenizer, - tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, - language_segmenter.get())); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("Hello World!"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"), + EqualsToken(Token::REGULAR, "World")))); - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("😊 Hello! Goodbye?"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "😊"), - EqualsToken(Token::Type::REGULAR, "Hello"), - EqualsToken(Token::Type::REGULAR, "Goodbye")))); - - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("Hello😊 !
Goodbye?"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"), - EqualsToken(Token::Type::REGULAR, "😊"), - EqualsToken(Token::Type::REGULAR, "Goodbye")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("hElLo WORLD"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "hElLo"), + EqualsToken(Token::REGULAR, "WORLD")))); } TEST_F(RawQueryTokenizerTest, Parentheses) { @@ -103,96 +78,84 @@ TEST_F(RawQueryTokenizerTest, Parentheses) { tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, language_segmenter.get())); - ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens, - raw_query_tokenizer->TokenizeAll("()")); - EXPECT_THAT( - query_tokens, - ElementsAre(EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("()"), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); - ICING_ASSERT_OK_AND_ASSIGN(query_tokens, - raw_query_tokenizer->TokenizeAll("( )")); - EXPECT_THAT( - query_tokens, - ElementsAre(EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("( )"), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); - ICING_ASSERT_OK_AND_ASSIGN(query_tokens, - raw_query_tokenizer->TokenizeAll("(term1 term2)")); - EXPECT_THAT( - query_tokens, - ElementsAre(EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::REGULAR, "term1"), - EqualsToken(Token::Type::REGULAR, "term2"), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 term2)"), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::REGULAR, "term1"), + EqualsToken(Token::REGULAR, "term2"), + EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("((term1 term2) (term3 term4))"), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::REGULAR, "term1"), + EqualsToken(Token::REGULAR, "term2"), + EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""), + EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::REGULAR, "term3"), + EqualsToken(Token::REGULAR, "term4"), + EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""), + EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1(term2)"), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::REGULAR, "term1"), + EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::REGULAR, "term2"), + EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); - ICING_ASSERT_OK_AND_ASSIGN( - query_tokens, - raw_query_tokenizer->TokenizeAll("((term1 term2) (term3 term4))")); - EXPECT_THAT( - query_tokens, - ElementsAre(EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::REGULAR, "term1"), - EqualsToken(Token::Type::REGULAR, "term2"), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""), - EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::REGULAR, "term3"), - EqualsToken(Token::Type::REGULAR, "term4"), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""), - 
EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))); - - ICING_ASSERT_OK_AND_ASSIGN(query_tokens, - raw_query_tokenizer->TokenizeAll("term1(term2)")); EXPECT_THAT( - query_tokens, - ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"), - EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::REGULAR, "term2"), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))); - - ICING_ASSERT_OK_AND_ASSIGN(query_tokens, - raw_query_tokenizer->TokenizeAll("(term1)term2")); - EXPECT_THAT(query_tokens, - ElementsAre(EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::REGULAR, "term1"), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""), - EqualsToken(Token::Type::REGULAR, "term2"))); + raw_query_tokenizer->TokenizeAll("(term1)term2"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::REGULAR, "term1"), + EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""), + EqualsToken(Token::REGULAR, "term2")))); EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)(term2)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::REGULAR, "term1"), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""), - EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::REGULAR, "term2"), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); - - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)-term2"), - IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::REGULAR, "term1"), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""), - EqualsToken(Token::Type::QUERY_EXCLUSION, ""), - EqualsToken(Token::Type::REGULAR, "term2")))); + EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::REGULAR, "term1"), + EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""), + EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::REGULAR, "term2"), + EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)OR term2"), - IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::REGULAR, "term1"), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""), - EqualsToken(Token::Type::QUERY_OR, ""), - EqualsToken(Token::Type::REGULAR, "term2")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("(term1)-term2"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::REGULAR, "term1"), + EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""), + EqualsToken(Token::QUERY_EXCLUSION, ""), + EqualsToken(Token::REGULAR, "term2")))); + + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("(term1)OR term2"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::REGULAR, "term1"), + EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""), + EqualsToken(Token::QUERY_OR, ""), + EqualsToken(Token::REGULAR, "term2")))); EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)OR(term2)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::REGULAR, "term1"), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""), - EqualsToken(Token::Type::QUERY_OR, ""), - EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::REGULAR, "term2"), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), + 
EqualsToken(Token::REGULAR, "term1"), + EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""), + EqualsToken(Token::QUERY_OR, ""), + EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::REGULAR, "term2"), + EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1):term2"), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT, @@ -217,49 +180,44 @@ TEST_F(RawQueryTokenizerTest, Exclustion) { tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, language_segmenter.get())); - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("-term1"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""), - EqualsToken(Token::Type::REGULAR, "term1")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-term1"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""), + EqualsToken(Token::REGULAR, "term1")))); EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(-term1)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::QUERY_EXCLUSION, ""), - EqualsToken(Token::Type::REGULAR, "term1"), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::QUERY_EXCLUSION, ""), + EqualsToken(Token::REGULAR, "term1"), + EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); // Exclusion operator is ignored - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("- term1"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("- term1"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1")))); // Exclusion operator is ignored - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("term1- term2"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"), - EqualsToken(Token::Type::REGULAR, "term2")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1- term2"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"), + EqualsToken(Token::REGULAR, "term2")))); // Exclusion operator is ignored EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 -)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::REGULAR, "term1"), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::REGULAR, "term1"), + EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); // First exclusion operator is ignored - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("--term1"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""), - EqualsToken(Token::Type::REGULAR, "term1")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("--term1"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""), + EqualsToken(Token::REGULAR, "term1")))); // First "-" is exclusion operator, second is not and will be discarded. // In other words, exclusion only applies to the term right after it. 
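// So for "-term1-term2" below, the expected stream is (QUERY_EXCLUSION)
// ("term1") ("term2"): the leading "-" negates only "term1", and the second
// "-" is dropped rather than negating "term2".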
- EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("-term1-term2"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""), - EqualsToken(Token::Type::REGULAR, "term1"), - EqualsToken(Token::Type::REGULAR, "term2")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-term1-term2"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""), + EqualsToken(Token::REGULAR, "term1"), + EqualsToken(Token::REGULAR, "term2")))); EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-(term1)"), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT, @@ -291,75 +249,73 @@ TEST_F(RawQueryTokenizerTest, PropertyRestriction) { tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, language_segmenter.get())); - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:term1"), - IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::QUERY_PROPERTY, "property1"), - EqualsToken(Token::Type::REGULAR, "term1")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("property1:term1"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"), + EqualsToken(Token::REGULAR, "term1")))); EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(property1:term1)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::QUERY_PROPERTY, "property1"), - EqualsToken(Token::Type::REGULAR, "term1"), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::QUERY_PROPERTY, "property1"), + EqualsToken(Token::REGULAR, "term1"), + EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); // Colon is ignored - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll(":term1"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll(":term1"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1")))); // Colon is ignored EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(:term1)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::REGULAR, "term1"), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::REGULAR, "term1"), + EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); // Colon is ignored - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("term1:"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1:"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1")))); // property name can be a path EXPECT_THAT(raw_query_tokenizer->TokenizeAll("email.title:hello"), - IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::QUERY_PROPERTY, "email.title"), - EqualsToken(Token::Type::REGULAR, "hello")))); + IsOkAndHolds( + ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "email.title"), + EqualsToken(Token::REGULAR, "hello")))); // The first colon ":" triggers property restriction, the second colon is used // as a word connector per ICU's rule // (https://unicode.org/reports/tr29/#Word_Boundaries). 
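// (In UAX #29 terms, ":" has Word_Break=MidLetter, so it only holds a word
// together when flanked by alphabetic letters on both sides; that is why
// "foo:bar" stays one term here, while "term1:term2" below splits, because
// the digit "1" before the colon breaks the letter:letter pattern.)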
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property:foo:bar"), - IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::QUERY_PROPERTY, "property"), - EqualsToken(Token::Type::REGULAR, "foo:bar")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("property:foo:bar"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property"), + EqualsToken(Token::REGULAR, "foo:bar")))); // Property restriction only applies to the term right after it. // Note: "term1:term2" is not a term but 2 terms because word connectors // don't apply to numbers and alphabets. - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:term1:term2"), - IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::QUERY_PROPERTY, "property1"), - EqualsToken(Token::Type::REGULAR, "term1"), - EqualsToken(Token::Type::REGULAR, "term2")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("property1:term1:term2"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"), + EqualsToken(Token::REGULAR, "term1"), + EqualsToken(Token::REGULAR, "term2")))); - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:今天:天气"), - IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::QUERY_PROPERTY, "property1"), - EqualsToken(Token::Type::REGULAR, "今天"), - EqualsToken(Token::Type::REGULAR, "天气")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("property1:今天:天气"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"), + EqualsToken(Token::REGULAR, "今天"), + EqualsToken(Token::REGULAR, "天气")))); - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:term1-"), - IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::QUERY_PROPERTY, "property1"), - EqualsToken(Token::Type::REGULAR, "term1")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("property1:term1-"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"), + EqualsToken(Token::REGULAR, "term1")))); // Multiple continuous colons will still be recognized as a property // restriction operator - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1::term1"), - IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::QUERY_PROPERTY, "property1"), - EqualsToken(Token::Type::REGULAR, "term1")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("property1::term1"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"), + EqualsToken(Token::REGULAR, "term1")))); EXPECT_THAT( raw_query_tokenizer->TokenizeAll("property1:(term1)"), @@ -389,109 +345,105 @@ TEST_F(RawQueryTokenizerTest, OR) { tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, language_segmenter.get())); - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("term1 OR term2"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"), - EqualsToken(Token::Type::QUERY_OR, ""), - EqualsToken(Token::Type::REGULAR, "term2")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR term2"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"), + EqualsToken(Token::QUERY_OR, ""), + EqualsToken(Token::REGULAR, "term2")))); // Two continuous "OR"s are treated as one - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("term1 OR OR term2"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"), - EqualsToken(Token::Type::QUERY_OR, ""), - EqualsToken(Token::Type::REGULAR, "term2")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR OR term2"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"), + EqualsToken(Token::QUERY_OR, ""), + EqualsToken(Token::REGULAR, 
"term2")))); - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1) OR term2"), - IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::REGULAR, "term1"), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""), - EqualsToken(Token::Type::QUERY_OR, ""), - EqualsToken(Token::Type::REGULAR, "term2")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("(term1) OR term2"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::REGULAR, "term1"), + EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""), + EqualsToken(Token::QUERY_OR, ""), + EqualsToken(Token::REGULAR, "term2")))); EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR (term2)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::REGULAR, "term1"), - EqualsToken(Token::Type::QUERY_OR, ""), - EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::REGULAR, "term2"), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::REGULAR, "term1"), + EqualsToken(Token::QUERY_OR, ""), + EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::REGULAR, "term2"), + EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); EXPECT_THAT(raw_query_tokenizer->TokenizeAll("((term1) OR (term2))"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::REGULAR, "term1"), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""), - EqualsToken(Token::Type::QUERY_OR, ""), - EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::REGULAR, "term2"), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::REGULAR, "term1"), + EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""), + EqualsToken(Token::QUERY_OR, ""), + EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::REGULAR, "term2"), + EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""), + EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); // Only "OR" (all in uppercase) is the operator EXPECT_THAT( raw_query_tokenizer->TokenizeAll("term1 or term2 Or term3 oR term4"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"), - EqualsToken(Token::Type::REGULAR, "or"), - EqualsToken(Token::Type::REGULAR, "term2"), - EqualsToken(Token::Type::REGULAR, "Or"), - EqualsToken(Token::Type::REGULAR, "term3"), - EqualsToken(Token::Type::REGULAR, "oR"), - EqualsToken(Token::Type::REGULAR, "term4")))); + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"), + EqualsToken(Token::REGULAR, "or"), + EqualsToken(Token::REGULAR, "term2"), + EqualsToken(Token::REGULAR, "Or"), + EqualsToken(Token::REGULAR, "term3"), + EqualsToken(Token::REGULAR, "oR"), + EqualsToken(Token::REGULAR, "term4")))); // "OR" is ignored - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("OR term1"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("OR term1"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1")))); // "OR" is ignored - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("term1 OR"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR"), + 
IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1")))); // "OR" is ignored EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(OR term1)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::REGULAR, "term1"), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::REGULAR, "term1"), + EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); // "OR" is ignored EXPECT_THAT(raw_query_tokenizer->TokenizeAll("( OR term1)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::REGULAR, "term1"), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::REGULAR, "term1"), + EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); // "OR" is ignored EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 OR)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::REGULAR, "term1"), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::REGULAR, "term1"), + EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); // "OR" is ignored EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 OR )"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::REGULAR, "term1"), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::REGULAR, "term1"), + EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); // "OR" is ignored EXPECT_THAT(raw_query_tokenizer->TokenizeAll("( OR )"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR(term2)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::REGULAR, "term1"), - EqualsToken(Token::Type::QUERY_OR, ""), - EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::REGULAR, "term2"), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::REGULAR, "term1"), + EqualsToken(Token::QUERY_OR, ""), + EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::REGULAR, "term2"), + EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); EXPECT_THAT( raw_query_tokenizer->TokenizeAll("term1 OR-term2"), @@ -520,31 +472,31 @@ TEST_F(RawQueryTokenizerTest, CJKT) { if (IsCfStringTokenization()) { EXPECT_THAT( raw_query_tokenizer->TokenizeAll("-今天天气很好"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""), - EqualsToken(Token::Type::REGULAR, "今天"), - EqualsToken(Token::Type::REGULAR, "天气"), - EqualsToken(Token::Type::REGULAR, "很"), - EqualsToken(Token::Type::REGULAR, "好")))); + IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""), + EqualsToken(Token::REGULAR, "今天"), + EqualsToken(Token::REGULAR, "天气"), + EqualsToken(Token::REGULAR, "很"), + EqualsToken(Token::REGULAR, "好")))); } else { EXPECT_THAT( raw_query_tokenizer->TokenizeAll("-今天天气很好"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""), - EqualsToken(Token::Type::REGULAR, "今天"), - EqualsToken(Token::Type::REGULAR, "天气"), - EqualsToken(Token::Type::REGULAR, "很好")))); + 
IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""), + EqualsToken(Token::REGULAR, "今天"), + EqualsToken(Token::REGULAR, "天气"), + EqualsToken(Token::REGULAR, "很好")))); } if (IsCfStringTokenization()) { EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:你好"), - IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::QUERY_PROPERTY, "property1"), - EqualsToken(Token::Type::REGULAR, "你"), - EqualsToken(Token::Type::REGULAR, "好")))); + IsOkAndHolds( + ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"), + EqualsToken(Token::REGULAR, "你"), + EqualsToken(Token::REGULAR, "好")))); } else { EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:你好"), - IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::QUERY_PROPERTY, "property1"), - EqualsToken(Token::Type::REGULAR, "你好")))); + IsOkAndHolds( + ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"), + EqualsToken(Token::REGULAR, "你好")))); } EXPECT_THAT( @@ -552,11 +504,10 @@ TEST_F(RawQueryTokenizerTest, CJKT) { StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT, HasSubstr("Characters in property name must all be ASCII"))); - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("cat OR ねこ"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "cat"), - EqualsToken(Token::Type::QUERY_OR, ""), - EqualsToken(Token::Type::REGULAR, "ねこ")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("cat OR ねこ"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "cat"), + EqualsToken(Token::QUERY_OR, ""), + EqualsToken(Token::REGULAR, "ねこ")))); EXPECT_THAT( raw_query_tokenizer->TokenizeAll("cat ORねこ"), @@ -592,45 +543,40 @@ TEST_F(RawQueryTokenizerTest, OtherChars) { language_segmenter.get())); // Comma is ignored - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll(",term1, ,"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll(",term1, ,"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1")))); EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(,term1),"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::REGULAR, "term1"), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::REGULAR, "term1"), + EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); // Exclusion operator and comma are ignored - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("-,term1"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-,term1"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1")))); - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("-term1,"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""), - EqualsToken(Token::Type::REGULAR, "term1")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-term1,"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""), + EqualsToken(Token::REGULAR, "term1")))); // Colon and comma are ignored - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("property1:,term1"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "property1"), - EqualsToken(Token::Type::REGULAR, "term1")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:,term1"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "property1"), + EqualsToken(Token::REGULAR, "term1")))); - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:term1,term2"), - IsOkAndHolds(ElementsAre( - 
EqualsToken(Token::Type::QUERY_PROPERTY, "property1"), - EqualsToken(Token::Type::REGULAR, "term1"), - EqualsToken(Token::Type::REGULAR, "term2")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("property1:term1,term2"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"), + EqualsToken(Token::REGULAR, "term1"), + EqualsToken(Token::REGULAR, "term2")))); // This is a special case for OR, unknown chars are treated the same as // whitespaces before and after OR. - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("term1,OR,term2"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"), - EqualsToken(Token::Type::QUERY_OR, ""), - EqualsToken(Token::Type::REGULAR, "term2")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1,OR,term2"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"), + EqualsToken(Token::QUERY_OR, ""), + EqualsToken(Token::REGULAR, "term2")))); } TEST_F(RawQueryTokenizerTest, Mix) { @@ -647,38 +593,37 @@ TEST_F(RawQueryTokenizerTest, Mix) { EXPECT_THAT(raw_query_tokenizer->TokenizeAll( "こんにちはgood afternoon, title:今天 OR (ในวันนี้ -B12)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::REGULAR, "こんにちは"), - EqualsToken(Token::Type::REGULAR, "good"), - EqualsToken(Token::Type::REGULAR, "afternoon"), - EqualsToken(Token::Type::QUERY_PROPERTY, "title"), - EqualsToken(Token::Type::REGULAR, "今天"), - EqualsToken(Token::Type::QUERY_OR, ""), - EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::REGULAR, "ใน"), - EqualsToken(Token::Type::REGULAR, "วันนี้"), - EqualsToken(Token::Type::QUERY_EXCLUSION, ""), - EqualsToken(Token::Type::REGULAR, "B12"), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::REGULAR, "こんにちは"), + EqualsToken(Token::REGULAR, "good"), + EqualsToken(Token::REGULAR, "afternoon"), + EqualsToken(Token::QUERY_PROPERTY, "title"), + EqualsToken(Token::REGULAR, "今天"), + EqualsToken(Token::QUERY_OR, ""), + EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::REGULAR, "ใน"), + EqualsToken(Token::REGULAR, "วันนี้"), + EqualsToken(Token::QUERY_EXCLUSION, ""), + EqualsToken(Token::REGULAR, "B12"), + EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); } else { ICING_ASSERT_OK_AND_ASSIGN( std::vector<Token> tokens, raw_query_tokenizer->TokenizeAll( "こんにちはgood afternoon, title:今天 OR (ในวันนี้ -B12)")); - EXPECT_THAT( - tokens, - ElementsAre(EqualsToken(Token::Type::REGULAR, "こんにちは"), - EqualsToken(Token::Type::REGULAR, "good"), - EqualsToken(Token::Type::REGULAR, "afternoon"), - EqualsToken(Token::Type::QUERY_PROPERTY, "title"), - EqualsToken(Token::Type::REGULAR, "今天"), - EqualsToken(Token::Type::QUERY_OR, ""), - EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::REGULAR, "ใน"), - EqualsToken(Token::Type::REGULAR, "วัน"), - EqualsToken(Token::Type::REGULAR, "นี้"), - EqualsToken(Token::Type::QUERY_EXCLUSION, ""), - EqualsToken(Token::Type::REGULAR, "B12"), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))); + EXPECT_THAT(tokens, + ElementsAre(EqualsToken(Token::REGULAR, "こんにちは"), + EqualsToken(Token::REGULAR, "good"), + EqualsToken(Token::REGULAR, "afternoon"), + EqualsToken(Token::QUERY_PROPERTY, "title"), + EqualsToken(Token::REGULAR, "今天"), + EqualsToken(Token::QUERY_OR, ""), + EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::REGULAR, "ใน"), + EqualsToken(Token::REGULAR, "วัน"), + EqualsToken(Token::REGULAR, "นี้"), + EqualsToken(Token::QUERY_EXCLUSION, ""), + 
EqualsToken(Token::REGULAR, "B12"), + EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))); } } diff --git a/icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc b/icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc index 8e1e563..6b1cb3a 100644 --- a/icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc +++ b/icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc @@ -15,10 +15,10 @@ #include "icing/tokenization/reverse_jni/reverse-jni-break-iterator.h" #include <jni.h> +#include <math.h> #include <cassert> #include <cctype> -#include <cmath> #include <map> #include "icing/jni/jni-cache.h" diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc index e5de6e6..76219b5 100644 --- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc +++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc @@ -51,9 +51,9 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator { if (term_end_exclusive_.utf16_index() == 0) { int first = break_iterator_->First(); if (!term_start_.MoveToUtf16(first)) { - // First is guaranteed to succeed and return a position within bounds. - // So the only possible failure could be an invalid sequence. Mark as - // DONE and return. + // First is guaranteed to succeed and return a position within bounds. So + // the only possible failure could be an invalid sequence. Mark as DONE + // and return. MarkAsDone(); return false; } @@ -74,7 +74,14 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator { MarkAsDone(); return false; } - return true; + + // Check if the current term is valid. We consider any term valid if its + // first character is valid. If it's not valid, then we need to advance to + // the next term. + if (IsValidTerm()) { + return true; + } + return Advance(); } // Returns the current term. It can be called only when Advance() returns @@ -237,7 +244,7 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator { // 4. The start and end indices point to a segment, but we need to ensure // that this segment is 1) valid and 2) ends before offset. Otherwise, we'll // need a segment prior to this one. - if (term_end_exclusive_.utf32_index() > offset) { + if (term_end_exclusive_.utf32_index() > offset || !IsValidTerm()) { return ResetToTermEndingBeforeUtf32(term_start_.utf32_index()); } return term_start_.utf32_index(); @@ -277,6 +284,21 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator { return term_end_exclusive_.utf16_index() == ReverseJniBreakIterator::kDone; } + bool IsValidTerm() const { + // Rule 1: all ASCII terms will be returned. + // We know it's an ASCII term by checking the first char. + if (i18n_utils::IsAscii(text_[term_start_.utf8_index()])) { + return true; + } + + // Rule 2: for non-ASCII terms, only the alphabetic terms are returned. + // We know it's an alphabetic term by checking the first Unicode character. + if (i18n_utils::IsAlphabeticAt(text_, term_start_.utf8_index())) { + return true; + } + return false; + } + // All of ReverseJniBreakIterator's functions return UTF-16 boundaries. So // this class needs to maintain state to convert between UTF-16 and UTF-8.
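The IsValidTerm() hunk above is the behavioral heart of this change: Advance() no longer surfaces every segment the break iterator produces, but skips any segment whose first character is neither ASCII nor alphabetic (full-width punctuation, symbols, and the like), which is why the test expectations below lose their "。"-style entries. A minimal standalone sketch of that filtering rule, with the Unicode check injected as a predicate since icing's i18n_utils helpers are not reproduced here:

    #include <functional>
    #include <string>
    #include <vector>

    // Keeps only the segments the segmenter now considers valid terms.
    // `first_char_is_alphabetic` stands in for i18n_utils::IsAlphabeticAt.
    std::vector<std::string> FilterValidTerms(
        const std::vector<std::string>& segments,
        const std::function<bool(const std::string&)>& first_char_is_alphabetic) {
      std::vector<std::string> terms;
      for (const std::string& segment : segments) {
        if (segment.empty()) continue;
        // Rule 1: ASCII terms are always kept (first byte below 0x80).
        if (static_cast<unsigned char>(segment[0]) < 0x80) {
          terms.push_back(segment);
          continue;
        }
        // Rule 2: non-ASCII terms are kept only if they are alphabetic.
        if (first_char_is_alphabetic(segment)) {
          terms.push_back(segment);
        }
      }
      return terms;
    }

Note also that the real Advance() implements the skip by self-recursion (return Advance();); an equivalent loop would avoid deep recursion over long runs of invalid segments.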
std::unique_ptr<ReverseJniBreakIterator> break_iterator_; diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc index 277ece6..b1a8f72 100644 --- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc +++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc @@ -185,7 +185,7 @@ TEST_P(ReverseJniLanguageSegmenterTest, Non_ASCII_Non_Alphabetic) { // Full-width (non-ASCII) punctuation marks and special characters are left // out. EXPECT_THAT(language_segmenter->GetAllTerms("。?·Hello!×"), - IsOkAndHolds(ElementsAre("。", "?", "·", "Hello", "!", "×"))); + IsOkAndHolds(ElementsAre("Hello"))); } TEST_P(ReverseJniLanguageSegmenterTest, Acronym) { @@ -246,9 +246,9 @@ TEST_P(ReverseJniLanguageSegmenterTest, WordConnector) { // Connectors don't connect if one side is an invalid term (?) EXPECT_THAT(language_segmenter->GetAllTerms("bar:baz:?"), - IsOkAndHolds(ElementsAre("bar:baz", ":", "?"))); + IsOkAndHolds(ElementsAre("bar:baz", ":"))); EXPECT_THAT(language_segmenter->GetAllTerms("?:bar:baz"), - IsOkAndHolds(ElementsAre("?", ":", "bar:baz"))); + IsOkAndHolds(ElementsAre(":", "bar:baz"))); EXPECT_THAT(language_segmenter->GetAllTerms("3:14"), IsOkAndHolds(ElementsAre("3", ":", "14"))); EXPECT_THAT(language_segmenter->GetAllTerms("私:は"), @@ -366,17 +366,6 @@ TEST_P(ReverseJniLanguageSegmenterTest, Number) { IsOkAndHolds(ElementsAre("-", "123"))); } -TEST_P(ReverseJniLanguageSegmenterTest, FullWidthNumbers) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create( - GetSegmenterOptions(GetLocale(), jni_cache_.get()))); - - EXPECT_THAT(language_segmenter->GetAllTerms("0123456789"), - IsOkAndHolds(ElementsAre("0", "1", "2", "3", "4", "5", "6", - "7", "8", "9"))); -} - TEST_P(ReverseJniLanguageSegmenterTest, ContinuousWhitespaces) { ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, @@ -413,17 +402,15 @@ TEST_P(ReverseJniLanguageSegmenterTest, CJKT) { // have whitespaces as word delimiter. 
// Chinese - EXPECT_THAT( - language_segmenter->GetAllTerms("我每天走路去上班。"), - IsOkAndHolds(ElementsAre("我", "每天", "走路", "去", "上班", "。"))); + EXPECT_THAT(language_segmenter->GetAllTerms("我每天走路去上班。"), + IsOkAndHolds(ElementsAre("我", "每天", "走路", "去", "上班"))); // Japanese EXPECT_THAT(language_segmenter->GetAllTerms("私は毎日仕事に歩いています。"), IsOkAndHolds(ElementsAre("私", "は", "毎日", "仕事", "に", "歩", - "い", "てい", "ます", "。"))); + "い", "てい", "ます"))); // Khmer EXPECT_THAT(language_segmenter->GetAllTerms("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"), - IsOkAndHolds(ElementsAre("ញុំ", "ដើរទៅ", "ធ្វើការ", "រាល់ថ្ងៃ", "។"))); - + IsOkAndHolds(ElementsAre("ញុំ", "ដើរទៅ", "ធ្វើការ", "រាល់ថ្ងៃ"))); // Thai EXPECT_THAT( language_segmenter->GetAllTerms("ฉันเดินไปทำงานทุกวัน"), @@ -854,19 +841,16 @@ TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermAfterUtf32) { ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment(kChinese)); // String: "我每天走路去上班。" - // ^ ^ ^ ^^ ^ - // UTF-8 idx: 0 3 9 15 18 24 - // UTF-32 idx: 0 1 3 5 6 8 + // ^ ^ ^ ^^ + // UTF-8 idx: 0 3 9 15 18 + // UTF-32 idx: 0 1 3 5 6 EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1))); EXPECT_THAT(itr->GetTerm(), Eq("每天")); EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(3))); EXPECT_THAT(itr->GetTerm(), Eq("走路")); - EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7), IsOkAndHolds(Eq(8))); - EXPECT_THAT(itr->GetTerm(), Eq("。")); - - EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(8), + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); } @@ -881,21 +865,18 @@ TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermAfterUtf32) { ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment(kJapanese)); // String: "私は毎日仕事に歩いています。" - // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ - // UTF-8 idx: 0 3 6 12 18212427 33 39 - // UTF-32 idx: 0 1 2 4 6 7 8 9 11 13 + // ^ ^ ^ ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 3 6 12 18212427 33 + // UTF-32 idx: 0 1 2 4 6 7 8 9 11 EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1))); EXPECT_THAT(itr->GetTerm(), Eq("は")); - EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(13), + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(11), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(3), IsOkAndHolds(Eq(4))); EXPECT_THAT(itr->GetTerm(), Eq("仕事")); - - EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(12), IsOkAndHolds(Eq(13))); - EXPECT_THAT(itr->GetTerm(), Eq("。")); } TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermAfterUtf32) { @@ -907,16 +888,13 @@ TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermAfterUtf32) { ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment(kKhmer)); // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។" - // ^ ^ ^ ^ ^ - // UTF-8 idx: 0 9 24 45 69 - // UTF-32 idx: 0 3 8 15 23 + // ^ ^ ^ ^ + // UTF-8 idx: 0 9 24 45 + // UTF-32 idx: 0 3 8 15 EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3))); EXPECT_THAT(itr->GetTerm(), Eq("ដើរទៅ")); - EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15), IsOkAndHolds(Eq(23))); - EXPECT_THAT(itr->GetTerm(), Eq("។")); - - EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(23), + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); diff --git 
a/icing/tokenization/token.h b/icing/tokenization/token.h index 0c268be..dda9efc 100644 --- a/icing/tokenization/token.h +++ b/icing/tokenization/token.h @@ -21,14 +21,11 @@ namespace icing { namespace lib { struct Token { - enum class Type { + enum Type { // Common types REGULAR, // A token without special meanings, the value of it will be // indexed or searched directly - VERBATIM, // A token that should be indexed and searched without any - // modifications to the raw text - // Types only used in raw query QUERY_OR, // Indicates OR logic between its left and right tokens QUERY_EXCLUSION, // Indicates exclusion operation on next token diff --git a/icing/tokenization/tokenizer-factory.cc b/icing/tokenization/tokenizer-factory.cc index b2508f7..9b59acf 100644 --- a/icing/tokenization/tokenizer-factory.cc +++ b/icing/tokenization/tokenizer-factory.cc @@ -23,7 +23,6 @@ #include "icing/tokenization/plain-tokenizer.h" #include "icing/tokenization/raw-query-tokenizer.h" #include "icing/tokenization/tokenizer.h" -#include "icing/tokenization/verbatim-tokenizer.h" #include "icing/util/status-macros.h" namespace icing { @@ -39,8 +38,6 @@ CreateIndexingTokenizer(StringIndexingConfig::TokenizerType::Code type, switch (type) { case StringIndexingConfig::TokenizerType::PLAIN: return std::make_unique<PlainTokenizer>(lang_segmenter); - case StringIndexingConfig::TokenizerType::VERBATIM: - return std::make_unique<VerbatimTokenizer>(); case StringIndexingConfig::TokenizerType::NONE: [[fallthrough]]; default: diff --git a/icing/tokenization/tokenizer.h b/icing/tokenization/tokenizer.h index 24f8269..b4f0c6e 100644 --- a/icing/tokenization/tokenizer.h +++ b/icing/tokenization/tokenizer.h @@ -40,6 +40,14 @@ class Tokenizer { public: virtual ~Tokenizer() = default; + enum Type { + // Index tokenizers + PLAIN, // Used to tokenize plain text input + + // Query tokenizers + RAW_QUERY, // Used to tokenize raw queries + }; + // An iterator helping to get tokens. // Example usage: // @@ -75,26 +83,22 @@ class Tokenizer { // offset. // Ex. // auto iterator = tokenizer.Tokenize("foo bar baz").ValueOrDie(); - // iterator.ResetToTokenStartingAfter(4); + // iterator.ResetToTokenAfter(4); // // The first full token starting after position 4 (the 'b' in "bar") is // // "baz". // PrintToken(iterator.GetToken()); // prints "baz" - virtual bool ResetToTokenStartingAfter(int32_t utf32_offset) { - return false; - } + virtual bool ResetToTokenAfter(int32_t offset) { return false; } // Sets the tokenizer to point at the first token that *ends* *before* // offset. Returns false if there are no valid tokens ending // before offset. // Ex. // auto iterator = tokenizer.Tokenize("foo bar baz").ValueOrDie(); - // iterator.ResetToTokenEndingBefore(4); + // iterator.ResetToTokenBefore(4); // // The first full token ending before position 4 (the 'b' in "bar") is // // "foo". // PrintToken(iterator.GetToken()); // prints "foo" - virtual bool ResetToTokenEndingBefore(int32_t utf32_offset) { - return false; - } + virtual bool ResetToTokenBefore(int32_t offset) { return false; } virtual bool ResetToStart() { return false; } }; diff --git a/icing/tokenization/verbatim-tokenizer.cc b/icing/tokenization/verbatim-tokenizer.cc deleted file mode 100644 index 0d3a320..0000000 --- a/icing/tokenization/verbatim-tokenizer.cc +++ /dev/null @@ -1,139 +0,0 @@ -// Copyright (C) 2021 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
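Most of the churn in the tokenizer tests earlier in this diff follows mechanically from the token.h hunk above: Token's scoped `enum class Type` becomes an unscoped `enum Type`, so call sites shorten from Token::Type::REGULAR to Token::REGULAR (the unscoped form also still accepts the longer spelling in C++11 and later). A quick illustration of the difference, independent of icing:

    struct Token {
      enum Type { REGULAR, QUERY_OR };             // unscoped: enumerators visible as Token::REGULAR
      enum class ScopedType { REGULAR, QUERY_OR }; // scoped: must write Token::ScopedType::REGULAR
    };

    int main() {
      Token::Type a = Token::REGULAR;        // OK for an unscoped enum
      Token::Type b = Token::Type::REGULAR;  // also OK since C++11
      int n = a;                             // unscoped enums convert implicitly to int
      // int m = Token::ScopedType::REGULAR; // error: scoped enums do not convert implicitly
      return n + b;                          // b likewise converts implicitly
    }

The same hunk drops the VERBATIM token type, which is what forces the removal of the verbatim tokenizer and its tests in the hunks that follow.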
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "icing/tokenization/verbatim-tokenizer.h" - -#include "icing/text_classifier/lib3/utils/base/statusor.h" -#include "icing/util/character-iterator.h" -#include "icing/util/status-macros.h" - -namespace icing { -namespace lib { - -class VerbatimTokenIterator : public Tokenizer::Iterator { - public: - explicit VerbatimTokenIterator(std::string_view text) - : term_(std::move(text)) {} - - bool Advance() override { - if (term_.empty() || has_advanced_to_end_) { - return false; - } - - has_advanced_to_end_ = true; - return true; - } - - Token GetToken() const override { - if (term_.empty() || !has_advanced_to_end_) { - return Token(Token::Type::INVALID); - } - - return Token(Token::Type::VERBATIM, term_); - } - - libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenStart() - override { - if (term_.empty()) { - return absl_ports::AbortedError( - "Could not calculate start of empty token."); - } - - return CharacterIterator(term_, 0, 0, 0); - } - - libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenEndExclusive() - override { - if (term_.empty()) { - return absl_ports::AbortedError( - "Could not calculate end of empty token."); - } - - if (token_end_iterator_.utf8_index() >= 0) { - return token_end_iterator_; - } - - bool moved_to_token_end = token_end_iterator_.MoveToUtf8(term_.length()); - if (moved_to_token_end) { - return token_end_iterator_; - } else { - return absl_ports::AbortedError("Could not move to end of token."); - } - } - - bool ResetToTokenStartingAfter(int32_t utf32_offset) override { - // We can only reset to the sole verbatim token, so we must have a negative - // offset for it to be considered the token after. - if (utf32_offset < 0) { - // Because we are now at the sole verbatim token, we should ensure we can - // no longer advance past it. - has_advanced_to_end_ = true; - return true; - } - return false; - } - - bool ResetToTokenEndingBefore(int32_t utf32_offset) override { - // We can only reset to the sole verbatim token, so we must have an offset - // after the end of the token for the reset to be valid. This means the - // provided utf-32 offset must be equal to or greater than the utf-32 length - // of the token. - if (token_end_iterator_.utf8_index() < 0) { - // Moves one index past the end of the term. - bool moved_to_token_end = token_end_iterator_.MoveToUtf8(term_.length()); - if (!moved_to_token_end) { - // We're unable to reset as we failed to move to the end of the term. - return false; - } - } - - if (utf32_offset >= token_end_iterator_.utf32_index()) { - // Because we are now at the sole verbatim token, we should ensure we can - // no longer advance past it. 
- has_advanced_to_end_ = true; - return true; - } - return false; - } - - bool ResetToStart() override { - has_advanced_to_end_ = true; - return true; - } - - private: - std::string_view term_; - CharacterIterator token_end_iterator_ = CharacterIterator(term_, -1, -1, -1); - // Used to determine whether we have advanced on the sole verbatim token - bool has_advanced_to_end_ = false; -}; - -libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>> -VerbatimTokenizer::Tokenize(std::string_view text) const { - return std::make_unique<VerbatimTokenIterator>(text); -} - -libtextclassifier3::StatusOr<std::vector<Token>> VerbatimTokenizer::TokenizeAll( - std::string_view text) const { - ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iterator, - Tokenize(text)); - std::vector<Token> tokens; - while (iterator->Advance()) { - tokens.push_back(iterator->GetToken()); - } - return tokens; -} - -} // namespace lib -} // namespace icing diff --git a/icing/tokenization/verbatim-tokenizer.h b/icing/tokenization/verbatim-tokenizer.h deleted file mode 100644 index 8404cf1..0000000 --- a/icing/tokenization/verbatim-tokenizer.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (C) 2021 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef ICING_TOKENIZATION_VERBATIM_H_ -#define ICING_TOKENIZATION_VERBATIM_H_ - -#include <memory> -#include <string_view> -#include <vector> - -#include "icing/text_classifier/lib3/utils/base/statusor.h" -#include "icing/tokenization/tokenizer.h" - -namespace icing { -namespace lib { - -// Provides verbatim tokenization on input text -class VerbatimTokenizer : public Tokenizer { - public: - libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>> Tokenize( - std::string_view text) const override; - - libtextclassifier3::StatusOr<std::vector<Token>> TokenizeAll( - std::string_view text) const override; -}; - -} // namespace lib -} // namespace icing - -#endif // ICING_TOKENIZATION_VERBATIM_H_ diff --git a/icing/tokenization/verbatim-tokenizer_test.cc b/icing/tokenization/verbatim-tokenizer_test.cc deleted file mode 100644 index e38c7aa..0000000 --- a/icing/tokenization/verbatim-tokenizer_test.cc +++ /dev/null @@ -1,209 +0,0 @@ -// Copyright (C) 2021 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
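Stripped of icing's StatusOr and CharacterIterator plumbing, the VerbatimTokenizer being deleted here is a one-shot iterator: Advance() succeeds exactly once and GetToken() then returns the entire input as a single token. A condensed sketch of that pattern (illustrative names, not icing's API):

    #include <string_view>

    // One-shot iterator: yields the whole input as a single token.
    class SingleTokenIterator {
     public:
      explicit SingleTokenIterator(std::string_view text) : text_(text) {}

      // Returns true exactly once for non-empty input, then always false.
      bool Advance() {
        if (text_.empty() || done_) return false;
        done_ = true;
        return true;
      }

      // Meaningful only after Advance() has returned true.
      std::string_view GetToken() const {
        return done_ ? text_ : std::string_view();
      }

     private:
      std::string_view text_;
      bool done_ = false;
    };

The reset methods in the deleted code follow from the same invariant: the sole token "starts after" only a negative offset, and "ends before" only an offset at or past the token's UTF-32 length.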
- -#include <string_view> - -#include "gmock/gmock.h" -#include "icing/portable/platform.h" -#include "icing/testing/common-matchers.h" -#include "icing/testing/icu-data-file-helper.h" -#include "icing/testing/jni-test-helpers.h" -#include "icing/testing/test-data.h" -#include "icing/tokenization/language-segmenter-factory.h" -#include "icing/tokenization/tokenizer-factory.h" -#include "icing/util/character-iterator.h" -#include "unicode/uloc.h" - -namespace icing { -namespace lib { -namespace { -using ::testing::ElementsAre; -using ::testing::Eq; -using ::testing::IsEmpty; - -class VerbatimTokenizerTest : public ::testing::Test { - protected: - void SetUp() override { - if (!IsCfStringTokenization() && !IsReverseJniTokenization()) { - ICING_ASSERT_OK( - // File generated via icu_data_file rule in //icing/BUILD. - icu_data_file_helper::SetUpICUDataFile( - GetTestFilePath("icing/icu.dat"))); - } - - jni_cache_ = GetTestJniCache(); - language_segmenter_factory::SegmenterOptions options(ULOC_US, - jni_cache_.get()); - ICING_ASSERT_OK_AND_ASSIGN( - language_segmenter_, - language_segmenter_factory::Create(std::move(options))); - } - - std::unique_ptr<const JniCache> jni_cache_; - std::unique_ptr<LanguageSegmenter> language_segmenter_; -}; - -TEST_F(VerbatimTokenizerTest, Empty) { - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer, - tokenizer_factory::CreateIndexingTokenizer( - StringIndexingConfig::TokenizerType::VERBATIM, - language_segmenter_.get())); - - EXPECT_THAT(verbatim_tokenizer->TokenizeAll(""), IsOkAndHolds(IsEmpty())); -} - -TEST_F(VerbatimTokenizerTest, Simple) { - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer, - tokenizer_factory::CreateIndexingTokenizer( - StringIndexingConfig::TokenizerType::VERBATIM, - language_segmenter_.get())); - - EXPECT_THAT( - verbatim_tokenizer->TokenizeAll("foo bar"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::VERBATIM, "foo bar")))); -} - -TEST_F(VerbatimTokenizerTest, Punctuation) { - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer, - tokenizer_factory::CreateIndexingTokenizer( - StringIndexingConfig::TokenizerType::VERBATIM, - language_segmenter_.get())); - - EXPECT_THAT(verbatim_tokenizer->TokenizeAll("Hello, world!"), - IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::VERBATIM, "Hello, world!")))); -} - -TEST_F(VerbatimTokenizerTest, InvalidTokenBeforeAdvancing) { - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer, - tokenizer_factory::CreateIndexingTokenizer( - StringIndexingConfig::TokenizerType::VERBATIM, - language_segmenter_.get())); - - constexpr std::string_view kText = "Hello, world!"; - auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie(); - - // We should get an invalid token if we get the token before advancing. - EXPECT_THAT(token_iterator->GetToken(), - EqualsToken(Token::Type::INVALID, "")); -} - -TEST_F(VerbatimTokenizerTest, ResetToTokenEndingBefore) { - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer, - tokenizer_factory::CreateIndexingTokenizer( - StringIndexingConfig::TokenizerType::VERBATIM, - language_segmenter_.get())); - - constexpr std::string_view kText = "Hello, world!"; - auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie(); - - // Reset to beginning of verbatim of token. We provide an offset of 13 as it - // is larger than the final index (12) of the verbatim token. 
- EXPECT_TRUE(token_iterator->ResetToTokenEndingBefore(13)); - EXPECT_THAT(token_iterator->GetToken(), - EqualsToken(Token::Type::VERBATIM, "Hello, world!")); - - // Ensure our cached character iterator properly maintains the end of the - // verbatim token. - EXPECT_TRUE(token_iterator->ResetToTokenEndingBefore(13)); - EXPECT_THAT(token_iterator->GetToken(), - EqualsToken(Token::Type::VERBATIM, "Hello, world!")); - - // We should not be able to reset with an offset before or within - // the verbatim token's utf-32 length. - EXPECT_FALSE(token_iterator->ResetToTokenEndingBefore(0)); - EXPECT_FALSE(token_iterator->ResetToTokenEndingBefore(12)); -} - -TEST_F(VerbatimTokenizerTest, ResetToTokenStartingAfter) { - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer, - tokenizer_factory::CreateIndexingTokenizer( - StringIndexingConfig::TokenizerType::VERBATIM, - language_segmenter_.get())); - - constexpr std::string_view kText = "Hello, world!"; - auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie(); - - // Get token without resetting - EXPECT_TRUE(token_iterator->Advance()); - EXPECT_THAT(token_iterator->GetToken(), - EqualsToken(Token::Type::VERBATIM, "Hello, world!")); - - // We expect a sole verbatim token, so it's not possible to reset after the - // start of the token. - EXPECT_FALSE(token_iterator->ResetToTokenStartingAfter(1)); - - // We expect to be reset to the sole verbatim token when the offset is - // negative. - EXPECT_TRUE(token_iterator->ResetToTokenStartingAfter(-1)); - EXPECT_THAT(token_iterator->GetToken(), - EqualsToken(Token::Type::VERBATIM, "Hello, world!")); -} - -TEST_F(VerbatimTokenizerTest, ResetToStart) { - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer, - tokenizer_factory::CreateIndexingTokenizer( - StringIndexingConfig::TokenizerType::VERBATIM, - language_segmenter_.get())); - - constexpr std::string_view kText = "Hello, world!"; - auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie(); - - // Get token without resetting - EXPECT_TRUE(token_iterator->Advance()); - EXPECT_THAT(token_iterator->GetToken(), - EqualsToken(Token::Type::VERBATIM, "Hello, world!")); - - // Retrieve token again after resetting to start - EXPECT_TRUE(token_iterator->ResetToStart()); - EXPECT_THAT(token_iterator->GetToken(), - EqualsToken(Token::Type::VERBATIM, "Hello, world!")); -} - -TEST_F(VerbatimTokenizerTest, CalculateTokenStart) { - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer, - tokenizer_factory::CreateIndexingTokenizer( - StringIndexingConfig::TokenizerType::VERBATIM, - language_segmenter_.get())); - - constexpr std::string_view kText = "Hello, world!"; - auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie(); - - ICING_ASSERT_OK_AND_ASSIGN(CharacterIterator start_character_iterator, - token_iterator->CalculateTokenStart()); - - // We should retrieve the character 'H', the first character of the token.
- EXPECT_THAT(start_character_iterator.GetCurrentChar(), Eq('H')); -} - -TEST_F(VerbatimTokenizerTest, CalculateTokenEnd) { - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer, - tokenizer_factory::CreateIndexingTokenizer( - StringIndexingConfig::TokenizerType::VERBATIM, - language_segmenter_.get())); - - constexpr std::string_view kText = "Hello, world!"; - auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie(); - - ICING_ASSERT_OK_AND_ASSIGN(CharacterIterator end_character_iterator, - token_iterator->CalculateTokenEndExclusive()); - - // We should retrieve the null character, as the returned character - // iterator will be set one past the end of the token. - EXPECT_THAT(end_character_iterator.GetCurrentChar(), Eq('\0')); -} - -} // namespace -} // namespace lib -} // namespace icing diff --git a/icing/transform/icu/icu-normalizer.cc b/icing/transform/icu/icu-normalizer.cc index aceb11d..eb0eead 100644 --- a/icing/transform/icu/icu-normalizer.cc +++ b/icing/transform/icu/icu-normalizer.cc @@ -29,7 +29,6 @@ #include "icing/util/status-macros.h" #include "unicode/umachine.h" #include "unicode/unorm2.h" -#include "unicode/ustring.h" #include "unicode/utrans.h" namespace icing { @@ -158,18 +157,14 @@ std::string IcuNormalizer::NormalizeLatin(const UNormalizer2* normalizer2, const std::string_view term) const { std::string result; result.reserve(term.length()); - int current_pos = 0; - while (current_pos < term.length()) { - if (i18n_utils::IsAscii(term[current_pos])) { - result.push_back(std::tolower(term[current_pos])); - ++current_pos; - } else { - UChar32 uchar32 = - i18n_utils::GetUChar32At(term.data(), term.length(), current_pos); + for (int i = 0; i < term.length(); i++) { + if (i18n_utils::IsAscii(term[i])) { + result.push_back(std::tolower(term[i])); + } else if (i18n_utils::IsLeadUtf8Byte(term[i])) { + UChar32 uchar32 = i18n_utils::GetUChar32At(term.data(), term.length(), i); if (uchar32 == i18n_utils::kInvalidUChar32) { ICING_LOG(WARNING) << "Unable to get uchar32 from " << term - << " at position " << current_pos; - current_pos += i18n_utils::GetUtf8Length(uchar32); + << " at position " << i; continue; } char ascii_char; @@ -182,9 +177,8 @@ std::string IcuNormalizer::NormalizeLatin(const UNormalizer2* normalizer2, // tokenized. We handle it here in case there's something wrong with // the tokenizers.
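The two sides of this NormalizeLatin hunk differ in how they walk UTF-8: the incoming (+) loop visits every byte, lowercases ASCII bytes in place, decodes only at lead bytes, and lets continuation bytes fall through; the outgoing (-) loop instead advanced its index by each decoded code point's full UTF-8 length. The lead-byte test that makes the byte-wise walk safe is a two-bit check; a sketch of the idea under the usual UTF-8 layout (IsUtf8LeadByte here approximates what a helper like i18n_utils::IsLeadUtf8Byte must compute):

    #include <cstdint>

    // A byte begins a UTF-8 sequence unless it is a continuation byte,
    // i.e. unless its top two bits are 10xxxxxx.
    bool IsUtf8LeadByte(uint8_t b) {
      return (b & 0xC0) != 0x80;  // ASCII (0xxxxxxx) and multi-byte leads (11xxxxxx) pass
    }

    // Counting code points by counting lead bytes shows why skipping
    // continuation bytes never decodes the same character twice.
    int CountCodePoints(const char* s, int len) {
      int count = 0;
      for (int i = 0; i < len; ++i) {
        if (IsUtf8LeadByte(static_cast<uint8_t>(s[i]))) ++count;
      }
      return count;
    }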
int utf8_length = i18n_utils::GetUtf8Length(uchar32); - absl_ports::StrAppend(&result, term.substr(current_pos, utf8_length)); + absl_ports::StrAppend(&result, term.substr(i, utf8_length)); } - current_pos += i18n_utils::GetUtf8Length(uchar32); } } @@ -267,106 +261,5 @@ std::string IcuNormalizer::TermTransformer::Transform( return std::move(utf8_term_or).ValueOrDie(); } -CharacterIterator FindNormalizedLatinMatchEndPosition( - const UNormalizer2* normalizer2, std::string_view term, - CharacterIterator char_itr, std::string_view normalized_term) { - CharacterIterator normalized_char_itr(normalized_term); - char ascii_char; - while (char_itr.utf8_index() < term.length() && - normalized_char_itr.utf8_index() < normalized_term.length()) { - UChar32 c = char_itr.GetCurrentChar(); - if (i18n_utils::IsAscii(c)) { - c = std::tolower(c); - } else if (DiacriticCharToAscii(normalizer2, c, &ascii_char)) { - c = ascii_char; - } - UChar32 normalized_c = normalized_char_itr.GetCurrentChar(); - if (c != normalized_c) { - return char_itr; - } - char_itr.AdvanceToUtf32(char_itr.utf32_index() + 1); - normalized_char_itr.AdvanceToUtf32(normalized_char_itr.utf32_index() + 1); - } - return char_itr; -} - -CharacterIterator -IcuNormalizer::TermTransformer::FindNormalizedNonLatinMatchEndPosition( - std::string_view term, CharacterIterator char_itr, - std::string_view normalized_term) const { - CharacterIterator normalized_char_itr(normalized_term); - UErrorCode status = U_ZERO_ERROR; - - constexpr int kUtf16CharBufferLength = 6; - UChar c16[kUtf16CharBufferLength]; - int32_t c16_length; - int32_t limit; - - constexpr int kCharBufferLength = 3 * 4; - char normalized_buffer[kCharBufferLength]; - int32_t c8_length; - while (char_itr.utf8_index() < term.length() && - normalized_char_itr.utf8_index() < normalized_term.length()) { - UChar32 c = char_itr.GetCurrentChar(); - int c_lenth = i18n_utils::GetUtf8Length(c); - u_strFromUTF8(c16, kUtf16CharBufferLength, &c16_length, - term.data() + char_itr.utf8_index(), - /*srcLength=*/c_lenth, &status); - if (U_FAILURE(status)) { - break; - } - - limit = c16_length; - utrans_transUChars(u_transliterator_, c16, &c16_length, - kUtf16CharBufferLength, - /*start=*/0, &limit, &status); - if (U_FAILURE(status)) { - break; - } - - u_strToUTF8(normalized_buffer, kCharBufferLength, &c8_length, c16, - c16_length, &status); - if (U_FAILURE(status)) { - break; - } - - for (int i = 0; i < c8_length; ++i) { - if (normalized_buffer[i] != - normalized_term[normalized_char_itr.utf8_index() + i]) { - return char_itr; - } - } - normalized_char_itr.AdvanceToUtf8(normalized_char_itr.utf8_index() + - c8_length); - char_itr.AdvanceToUtf32(char_itr.utf32_index() + 1); - } - if (U_FAILURE(status)) { - // Failed to transform, return its original form. 
- ICING_LOG(WARNING) << "Failed to normalize UTF8 term: " << term; - } - return char_itr; -} - -CharacterIterator IcuNormalizer::FindNormalizedMatchEndPosition( - std::string_view term, std::string_view normalized_term) const { - UErrorCode status = U_ZERO_ERROR; - // ICU manages the singleton instance - const UNormalizer2* normalizer2 = unorm2_getNFCInstance(&status); - if (U_FAILURE(status)) { - ICING_LOG(WARNING) << "Failed to create a UNormalizer2 instance"; - } - - CharacterIterator char_itr(term); - UChar32 first_uchar32 = char_itr.GetCurrentChar(); - if (normalizer2 != nullptr && first_uchar32 != i18n_utils::kInvalidUChar32 && - DiacriticCharToAscii(normalizer2, first_uchar32, /*char_out=*/nullptr)) { - return FindNormalizedLatinMatchEndPosition(normalizer2, term, char_itr, - normalized_term); - } else { - return term_transformer_->FindNormalizedNonLatinMatchEndPosition( - term, char_itr, normalized_term); - } -} - } // namespace lib } // namespace icing diff --git a/icing/transform/icu/icu-normalizer.h b/icing/transform/icu/icu-normalizer.h index d4f1ebd..f20a9fb 100644 --- a/icing/transform/icu/icu-normalizer.h +++ b/icing/transform/icu/icu-normalizer.h @@ -21,7 +21,6 @@ #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/transform/normalizer.h" -#include "icing/util/character-iterator.h" #include "unicode/unorm2.h" #include "unicode/utrans.h" @@ -57,17 +56,6 @@ class IcuNormalizer : public Normalizer { // result in the non-Latin characters not properly being normalized std::string NormalizeTerm(std::string_view term) const override; - // Returns a CharacterIterator pointing to one past the end of the segment of - // term that (once normalized) matches with normalized_term. - // - // Ex. FindNormalizedMatchEndPosition("YELLOW", "yell") will return - // CharacterIterator(u8:4, u16:4, u32:4). - // - // Ex. FindNormalizedMatchEndPosition("YELLOW", "red") will return - // CharacterIterator(u8:0, u16:0, u32:0). - CharacterIterator FindNormalizedMatchEndPosition( - std::string_view term, std::string_view normalized_term) const override; - private: // A handler class that helps manage the lifecycle of UTransliterator. It's // used in IcuNormalizer to transform terms into the formats we need. @@ -87,12 +75,6 @@ class IcuNormalizer : public Normalizer { // Transforms the text based on our rules described at top of this file std::string Transform(std::string_view term) const; - // Returns a CharacterIterator pointing to one past the end of the segment - // of a non-latin term that (once normalized) matches with normalized_term. 
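The FindNormalizedMatchEndPosition API removed above (its deleted header contract: ("YELLOW", "yell") yields index 4, ("YELLOW", "red") yields 0) reduces to a lock-step walk that normalizes one character at a time and stops at the first mismatch. A simplified sketch restricted to ASCII case folding; the deleted implementations do the same walk over UTF-32 code points via CharacterIterator, consulting the normalizer per character:

    #include <cctype>
    #include <cstddef>
    #include <string_view>

    // Index one past the longest prefix of `term` that, after per-character
    // normalization (here just ASCII tolower), matches a prefix of
    // `normalized_term`.
    size_t FindNormalizedMatchEnd(std::string_view term,
                                  std::string_view normalized_term) {
      size_t i = 0;
      while (i < term.size() && i < normalized_term.size()) {
        unsigned char c = static_cast<unsigned char>(term[i]);
        if (std::tolower(c) !=
            static_cast<unsigned char>(normalized_term[i])) {
          break;  // first mismatch ends the matched prefix
        }
        ++i;
      }
      return i;  // FindNormalizedMatchEnd("YELLOW", "yell") == 4
    }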
- CharacterIterator FindNormalizedNonLatinMatchEndPosition( - std::string_view term, CharacterIterator char_itr, - std::string_view normalized_term) const; - private: explicit TermTransformer(UTransliterator* u_transliterator); diff --git a/icing/transform/icu/icu-normalizer_benchmark.cc b/icing/transform/icu/icu-normalizer_benchmark.cc index fdd4c70..b037538 100644 --- a/icing/transform/icu/icu-normalizer_benchmark.cc +++ b/icing/transform/icu/icu-normalizer_benchmark.cc @@ -14,8 +14,8 @@ #include "testing/base/public/benchmark.h" #include "gmock/gmock.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/testing/common-matchers.h" -#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/test-data.h" #include "icing/transform/normalizer-factory.h" #include "icing/transform/normalizer.h" @@ -161,124 +161,6 @@ BENCHMARK(BM_NormalizeHiragana) ->Arg(2048000) ->Arg(4096000); -void BM_UppercaseSubTokenLength(benchmark::State& state) { - bool run_via_adb = absl::GetFlag(FLAGS_adb); - if (!run_via_adb) { - ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile( - GetTestFilePath("icing/icu.dat"))); - } - - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<Normalizer> normalizer, - normalizer_factory::Create( - - /*max_term_byte_size=*/std::numeric_limits<int>::max())); - - std::string input_string(state.range(0), 'A'); - std::string normalized_input_string(state.range(0), 'a'); - for (auto _ : state) { - normalizer->FindNormalizedMatchEndPosition(input_string, - normalized_input_string); - } -} -BENCHMARK(BM_UppercaseSubTokenLength) - ->Arg(1000) - ->Arg(2000) - ->Arg(4000) - ->Arg(8000) - ->Arg(16000) - ->Arg(32000) - ->Arg(64000) - ->Arg(128000) - ->Arg(256000) - ->Arg(384000) - ->Arg(512000) - ->Arg(1024000) - ->Arg(2048000) - ->Arg(4096000); - -void BM_AccentSubTokenLength(benchmark::State& state) { - bool run_via_adb = absl::GetFlag(FLAGS_adb); - if (!run_via_adb) { - ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile( - GetTestFilePath("icing/icu.dat"))); - } - - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<Normalizer> normalizer, - normalizer_factory::Create( - - /*max_term_byte_size=*/std::numeric_limits<int>::max())); - - std::string input_string; - std::string normalized_input_string; - while (input_string.length() < state.range(0)) { - input_string.append("àáâãā"); - normalized_input_string.append("aaaaa"); - } - - for (auto _ : state) { - normalizer->FindNormalizedMatchEndPosition(input_string, - normalized_input_string); - } -} -BENCHMARK(BM_AccentSubTokenLength) - ->Arg(1000) - ->Arg(2000) - ->Arg(4000) - ->Arg(8000) - ->Arg(16000) - ->Arg(32000) - ->Arg(64000) - ->Arg(128000) - ->Arg(256000) - ->Arg(384000) - ->Arg(512000) - ->Arg(1024000) - ->Arg(2048000) - ->Arg(4096000); - -void BM_HiraganaSubTokenLength(benchmark::State& state) { - bool run_via_adb = absl::GetFlag(FLAGS_adb); - if (!run_via_adb) { - ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile( - GetTestFilePath("icing/icu.dat"))); - } - - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<Normalizer> normalizer, - normalizer_factory::Create( - - /*max_term_byte_size=*/std::numeric_limits<int>::max())); - - std::string input_string; - std::string normalized_input_string; - while (input_string.length() < state.range(0)) { - input_string.append("あいうえお"); - normalized_input_string.append("アイウエオ"); - } - - for (auto _ : state) { - normalizer->FindNormalizedMatchEndPosition(input_string, - normalized_input_string); - } -} -BENCHMARK(BM_HiraganaSubTokenLength) - ->Arg(1000) - ->Arg(2000) - 
->Arg(4000) - ->Arg(8000) - ->Arg(16000) - ->Arg(32000) - ->Arg(64000) - ->Arg(128000) - ->Arg(256000) - ->Arg(384000) - ->Arg(512000) - ->Arg(1024000) - ->Arg(2048000) - ->Arg(4096000); - } // namespace } // namespace lib diff --git a/icing/transform/icu/icu-normalizer_test.cc b/icing/transform/icu/icu-normalizer_test.cc index 143da17..f5d20ff 100644 --- a/icing/transform/icu/icu-normalizer_test.cc +++ b/icing/transform/icu/icu-normalizer_test.cc @@ -16,8 +16,8 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/testing/common-matchers.h" -#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/icu-i18n-test-utils.h" #include "icing/testing/test-data.h" #include "icing/transform/normalizer-factory.h" @@ -231,104 +231,6 @@ TEST_F(IcuNormalizerTest, Truncate) { } } -TEST_F(IcuNormalizerTest, PrefixMatchLength) { - // Verify that FindNormalizedMatchEndPosition will properly find the length of - // the prefix match when given a non-normalized term and a normalized term - // is a prefix of the non-normalized one. - ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( - /*max_term_byte_size=*/1000)); - - // Upper to lower - std::string term = "MDI"; - CharacterIterator match_end = - normalizer->FindNormalizedMatchEndPosition(term, "md"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("MD")); - - term = "Icing"; - match_end = normalizer->FindNormalizedMatchEndPosition(term, "icin"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Icin")); - - // Full-width - term = "525600"; - match_end = normalizer->FindNormalizedMatchEndPosition(term, "525"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("525")); - - term = "FULLWIDTH"; - match_end = normalizer->FindNormalizedMatchEndPosition(term, "full"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("FULL")); - - // Hiragana to Katakana - term = "あいうえお"; - match_end = normalizer->FindNormalizedMatchEndPosition(term, "アイ"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("あい")); - - term = "かきくけこ"; - match_end = normalizer->FindNormalizedMatchEndPosition(term, "カ"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("か")); - - // Latin accents - term = "Zürich"; - match_end = normalizer->FindNormalizedMatchEndPosition(term, "zur"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Zür")); - - term = "après-midi"; - match_end = normalizer->FindNormalizedMatchEndPosition(term, "apre"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("aprè")); - - term = "Buenos días"; - match_end = normalizer->FindNormalizedMatchEndPosition(term, "buenos di"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Buenos dí")); -} - -TEST_F(IcuNormalizerTest, SharedPrefixMatchLength) { - // Verify that FindNormalizedMatchEndPosition will properly find the length of - // the prefix match when given a non-normalized term and a normalized term - // that share a common prefix. 
- ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( - /*max_term_byte_size=*/1000)); - - // Upper to lower - std::string term = "MDI"; - CharacterIterator match_end = - normalizer->FindNormalizedMatchEndPosition(term, "mgm"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("M")); - - term = "Icing"; - match_end = normalizer->FindNormalizedMatchEndPosition(term, "icky"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Ic")); - - // Full-width - term = "525600"; - match_end = normalizer->FindNormalizedMatchEndPosition(term, "525788"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("525")); - - term = "FULLWIDTH"; - match_end = normalizer->FindNormalizedMatchEndPosition(term, "fully"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("FULL")); - - // Hiragana to Katakana - term = "あいうえお"; - match_end = normalizer->FindNormalizedMatchEndPosition(term, "アイエオ"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("あい")); - - term = "かきくけこ"; - match_end = normalizer->FindNormalizedMatchEndPosition(term, "カケコ"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("か")); - - // Latin accents - term = "Zürich"; - match_end = normalizer->FindNormalizedMatchEndPosition(term, "zurg"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Zür")); - - term = "après-midi"; - match_end = normalizer->FindNormalizedMatchEndPosition(term, "apreciate"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("aprè")); - - term = "días"; - match_end = normalizer->FindNormalizedMatchEndPosition(term, "diamond"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("día")); -} - } // namespace } // namespace lib } // namespace icing diff --git a/icing/transform/map/map-normalizer.cc b/icing/transform/map/map-normalizer.cc index 61fce65..c888551 100644 --- a/icing/transform/map/map-normalizer.cc +++ b/icing/transform/map/map-normalizer.cc @@ -14,7 +14,8 @@ #include "icing/transform/map/map-normalizer.h" -#include <cctype> +#include <ctype.h> + #include <string> #include <string_view> #include <unordered_map> @@ -22,7 +23,6 @@ #include "icing/absl_ports/str_cat.h" #include "icing/transform/map/normalization-map.h" -#include "icing/util/character-iterator.h" #include "icing/util/i18n-utils.h" #include "icing/util/logging.h" #include "unicode/utypes.h" @@ -30,70 +30,48 @@ namespace icing { namespace lib { -namespace { - -UChar32 NormalizeChar(UChar32 c) { - if (i18n_utils::GetUtf16Length(c) > 1) { - // All the characters we need to normalize can be encoded into a - // single char16_t. If this character needs more than 1 char16_t code - // unit, we can skip normalization and append it directly. - return c; - } - - // The original character can be encoded into a single char16_t. - const std::unordered_map<char16_t, char16_t>* normalization_map = - GetNormalizationMap(); - if (normalization_map == nullptr) { - // Normalization map couldn't be properly initialized, append the original - // character. - ICING_LOG(WARNING) << "Unable to get a valid pointer to normalization map!"; - return c; - } - auto iterator = normalization_map->find(static_cast<char16_t>(c)); - if (iterator == normalization_map->end()) { - // Normalization mapping not found, append the original character. - return c; - } - - // Found a normalization mapping. The normalized character (stored in a - // char16_t) can have 1 or 2 bytes. - if (i18n_utils::IsAscii(iterator->second)) { - // The normalized character has 1 byte. It may be an upper-case char. 
- // Lower-case it before returning it. - return std::tolower(static_cast<char>(iterator->second)); - } else { - return iterator->second; - } -} - -} // namespace - std::string MapNormalizer::NormalizeTerm(std::string_view term) const { std::string normalized_text; normalized_text.reserve(term.length()); - int current_pos = 0; - while (current_pos < term.length()) { - if (i18n_utils::IsAscii(term[current_pos])) { - normalized_text.push_back(std::tolower(term[current_pos])); - ++current_pos; - } else { - UChar32 uchar32 = - i18n_utils::GetUChar32At(term.data(), term.length(), current_pos); + for (int i = 0; i < term.length(); ++i) { + if (i18n_utils::IsAscii(term[i])) { + // The original character has 1 byte. + normalized_text.push_back(std::tolower(term[i])); + } else if (i18n_utils::IsLeadUtf8Byte(term[i])) { + UChar32 uchar32 = i18n_utils::GetUChar32At(term.data(), term.length(), i); if (uchar32 == i18n_utils::kInvalidUChar32) { ICING_LOG(WARNING) << "Unable to get uchar32 from " << term - << " at position" << current_pos; - ++current_pos; + << " at position" << i; continue; } - UChar32 normalized_char32 = NormalizeChar(uchar32); - if (i18n_utils::IsAscii(normalized_char32)) { - normalized_text.push_back(normalized_char32); + int utf8_length = i18n_utils::GetUtf8Length(uchar32); + if (i18n_utils::GetUtf16Length(uchar32) > 1) { + // All the characters we need to normalize can be encoded into a + // single char16_t. If this character needs more than 1 char16_t code + // unit, we can skip normalization and append it directly. + absl_ports::StrAppend(&normalized_text, term.substr(i, utf8_length)); + continue; + } + // The original character can be encoded into a single char16_t. + const std::unordered_map<char16_t, char16_t>& normalization_map = + GetNormalizationMap(); + auto iterator = normalization_map.find(static_cast<char16_t>(uchar32)); + if (iterator != normalization_map.end()) { + // Found a normalization mapping. The normalized character (stored in a + // char16_t) can have 1 or 2 bytes. + if (i18n_utils::IsAscii(iterator->second)) { + // The normalized character has 1 byte. + normalized_text.push_back( + std::tolower(static_cast<char>(iterator->second))); + } else { + // The normalized character has 2 bytes. + i18n_utils::AppendUchar32ToUtf8(&normalized_text, iterator->second); + } } else { - // The normalized character has 2 bytes. - i18n_utils::AppendUchar32ToUtf8(&normalized_text, normalized_char32); + // Normalization mapping not found, append the original character. 
+ absl_ports::StrAppend(&normalized_text, term.substr(i, utf8_length)); } - current_pos += i18n_utils::GetUtf8Length(uchar32); } } @@ -104,27 +82,5 @@ std::string MapNormalizer::NormalizeTerm(std::string_view term) const { return normalized_text; } -CharacterIterator MapNormalizer::FindNormalizedMatchEndPosition( - std::string_view term, std::string_view normalized_term) const { - CharacterIterator char_itr(term); - CharacterIterator normalized_char_itr(normalized_term); - while (char_itr.utf8_index() < term.length() && - normalized_char_itr.utf8_index() < normalized_term.length()) { - UChar32 c = char_itr.GetCurrentChar(); - if (i18n_utils::IsAscii(c)) { - c = std::tolower(c); - } else { - c = NormalizeChar(c); - } - UChar32 normalized_c = normalized_char_itr.GetCurrentChar(); - if (c != normalized_c) { - return char_itr; - } - char_itr.AdvanceToUtf32(char_itr.utf32_index() + 1); - normalized_char_itr.AdvanceToUtf32(normalized_char_itr.utf32_index() + 1); - } - return char_itr; -} - } // namespace lib } // namespace icing diff --git a/icing/transform/map/map-normalizer.h b/icing/transform/map/map-normalizer.h index ed996ae..f9c0e42 100644 --- a/icing/transform/map/map-normalizer.h +++ b/icing/transform/map/map-normalizer.h @@ -19,7 +19,6 @@ #include <string_view> #include "icing/transform/normalizer.h" -#include "icing/util/character-iterator.h" namespace icing { namespace lib { @@ -40,17 +39,6 @@ class MapNormalizer : public Normalizer { // Read more mapping details in normalization-map.cc std::string NormalizeTerm(std::string_view term) const override; - // Returns a CharacterIterator pointing to one past the end of the segment of - // term that (once normalized) matches with normalized_term. - // - // Ex. FindNormalizedMatchEndPosition("YELLOW", "yell") will return - // CharacterIterator(u8:4, u16:4, u32:4). - // - // Ex. FindNormalizedMatchEndPosition("YELLOW", "red") will return - // CharacterIterator(u8:0, u16:0, u32:0). - CharacterIterator FindNormalizedMatchEndPosition( - std::string_view term, std::string_view normalized_term) const override; - private: // The maximum term length allowed after normalization. 
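Taken together with the tests and benchmarks elsewhere in this diff, the map normalizer's table drives three visible behaviors: ASCII case folding, full-width-to-ASCII folding, and hiragana-to-katakana mapping. A hypothetical spot-check in the style of the surrounding tests (the factory call matches the one used in map-normalizer_test.cc; the expected strings are taken from test and benchmark data in this change):

    ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
                                   /*max_term_byte_size=*/1000));
    EXPECT_THAT(normalizer->NormalizeTerm("MDI"), Eq("mdi"));              // ASCII tolower
    EXPECT_THAT(normalizer->NormalizeTerm("ＦＵＬＬ"), Eq("full"));        // full-width -> ASCII
    EXPECT_THAT(normalizer->NormalizeTerm("あいうえお"), Eq("アイウエオ")); // hiragana -> katakana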
int max_term_byte_size_; diff --git a/icing/transform/map/map-normalizer_benchmark.cc b/icing/transform/map/map-normalizer_benchmark.cc index 8268541..691afc6 100644 --- a/icing/transform/map/map-normalizer_benchmark.cc +++ b/icing/transform/map/map-normalizer_benchmark.cc @@ -143,104 +143,6 @@ BENCHMARK(BM_NormalizeHiragana) ->Arg(2048000) ->Arg(4096000); -void BM_UppercaseSubTokenLength(benchmark::State& state) { - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<Normalizer> normalizer, - normalizer_factory::Create( - - /*max_term_byte_size=*/std::numeric_limits<int>::max())); - - std::string input_string(state.range(0), 'A'); - std::string normalized_input_string(state.range(0), 'a'); - for (auto _ : state) { - normalizer->FindNormalizedMatchEndPosition(input_string, - normalized_input_string); - } -} -BENCHMARK(BM_UppercaseSubTokenLength) - ->Arg(1000) - ->Arg(2000) - ->Arg(4000) - ->Arg(8000) - ->Arg(16000) - ->Arg(32000) - ->Arg(64000) - ->Arg(128000) - ->Arg(256000) - ->Arg(384000) - ->Arg(512000) - ->Arg(1024000) - ->Arg(2048000) - ->Arg(4096000); - -void BM_AccentSubTokenLength(benchmark::State& state) { - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<Normalizer> normalizer, - normalizer_factory::Create( - /*max_term_byte_size=*/std::numeric_limits<int>::max())); - - std::string input_string; - std::string normalized_input_string; - while (input_string.length() < state.range(0)) { - input_string.append("àáâãā"); - normalized_input_string.append("aaaaa"); - } - - for (auto _ : state) { - normalizer->FindNormalizedMatchEndPosition(input_string, - normalized_input_string); - } -} -BENCHMARK(BM_AccentSubTokenLength) - ->Arg(1000) - ->Arg(2000) - ->Arg(4000) - ->Arg(8000) - ->Arg(16000) - ->Arg(32000) - ->Arg(64000) - ->Arg(128000) - ->Arg(256000) - ->Arg(384000) - ->Arg(512000) - ->Arg(1024000) - ->Arg(2048000) - ->Arg(4096000); - -void BM_HiraganaSubTokenLength(benchmark::State& state) { - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<Normalizer> normalizer, - normalizer_factory::Create( - /*max_term_byte_size=*/std::numeric_limits<int>::max())); - - std::string input_string; - std::string normalized_input_string; - while (input_string.length() < state.range(0)) { - input_string.append("あいうえお"); - normalized_input_string.append("アイウエオ"); - } - - for (auto _ : state) { - normalizer->FindNormalizedMatchEndPosition(input_string, - normalized_input_string); - } -} -BENCHMARK(BM_HiraganaSubTokenLength) - ->Arg(1000) - ->Arg(2000) - ->Arg(4000) - ->Arg(8000) - ->Arg(16000) - ->Arg(32000) - ->Arg(64000) - ->Arg(128000) - ->Arg(256000) - ->Arg(384000) - ->Arg(512000) - ->Arg(1024000) - ->Arg(2048000) - ->Arg(4096000); - } // namespace } // namespace lib diff --git a/icing/transform/map/map-normalizer_test.cc b/icing/transform/map/map-normalizer_test.cc index adc5623..b62ae0e 100644 --- a/icing/transform/map/map-normalizer_test.cc +++ b/icing/transform/map/map-normalizer_test.cc @@ -23,7 +23,6 @@ #include "icing/testing/icu-i18n-test-utils.h" #include "icing/transform/normalizer-factory.h" #include "icing/transform/normalizer.h" -#include "icing/util/character-iterator.h" namespace icing { namespace lib { @@ -200,104 +199,6 @@ TEST(MapNormalizerTest, Truncate) { } } -TEST(MapNormalizerTest, PrefixMatchLength) { - // Verify that FindNormalizedMatchEndPosition will properly find the length of - // the prefix match when given a non-normalized term and a normalized term - // is a prefix of the non-normalized one. 
- ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( - /*max_term_byte_size=*/1000)); - - // Upper to lower - std::string term = "MDI"; - CharacterIterator match_end = - normalizer->FindNormalizedMatchEndPosition(term, "md"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("MD")); - - term = "Icing"; - match_end = normalizer->FindNormalizedMatchEndPosition(term, "icin"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Icin")); - - // Full-width - term = "525600"; - match_end = normalizer->FindNormalizedMatchEndPosition(term, "525"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("525")); - - term = "FULLWIDTH"; - match_end = normalizer->FindNormalizedMatchEndPosition(term, "full"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("FULL")); - - // Hiragana to Katakana - term = "あいうえお"; - match_end = normalizer->FindNormalizedMatchEndPosition(term, "アイ"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("あい")); - - term = "かきくけこ"; - match_end = normalizer->FindNormalizedMatchEndPosition(term, "カ"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("か")); - - // Latin accents - term = "Zürich"; - match_end = normalizer->FindNormalizedMatchEndPosition(term, "zur"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Zür")); - - term = "après-midi"; - match_end = normalizer->FindNormalizedMatchEndPosition(term, "apre"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("aprè")); - - term = "Buenos días"; - match_end = normalizer->FindNormalizedMatchEndPosition(term, "buenos di"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Buenos dí")); -} - -TEST(MapNormalizerTest, SharedPrefixMatchLength) { - // Verify that FindNormalizedMatchEndPosition will properly find the length of - // the prefix match when given a non-normalized term and a normalized term - // that share a common prefix. 
- ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( - /*max_term_byte_size=*/1000)); - - // Upper to lower - std::string term = "MDI"; - CharacterIterator match_end = - normalizer->FindNormalizedMatchEndPosition(term, "mgm"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("M")); - - term = "Icing"; - match_end = normalizer->FindNormalizedMatchEndPosition(term, "icky"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Ic")); - - // Full-width - term = "525600"; - match_end = normalizer->FindNormalizedMatchEndPosition(term, "525788"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("525")); - - term = "FULLWIDTH"; - match_end = normalizer->FindNormalizedMatchEndPosition(term, "fully"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("FULL")); - - // Hiragana to Katakana - term = "あいうえお"; - match_end = normalizer->FindNormalizedMatchEndPosition(term, "アイエオ"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("あい")); - - term = "かきくけこ"; - match_end = normalizer->FindNormalizedMatchEndPosition(term, "カケコ"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("か")); - - // Latin accents - term = "Zürich"; - match_end = normalizer->FindNormalizedMatchEndPosition(term, "zurg"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Zür")); - - term = "après-midi"; - match_end = normalizer->FindNormalizedMatchEndPosition(term, "apreciate"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("aprè")); - - term = "días"; - match_end = normalizer->FindNormalizedMatchEndPosition(term, "diamond"); - EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("día")); -} - } // namespace } // namespace lib diff --git a/icing/transform/map/normalization-map.cc b/icing/transform/map/normalization-map.cc index 0994ab8..c318036 100644 --- a/icing/transform/map/normalization-map.cc +++ b/icing/transform/map/normalization-map.cc @@ -691,21 +691,19 @@ constexpr NormalizationPair kNormalizationMappings[] = { } // namespace -const std::unordered_map<char16_t, char16_t> *GetNormalizationMap() { +const std::unordered_map<char16_t, char16_t>& GetNormalizationMap() { // The map is allocated dynamically the first time this function is executed. - static const std::unordered_map<char16_t, char16_t> *const normalization_map = - [] { - auto *map = new std::unordered_map<char16_t, char16_t>(); - // Size of all the mappings is about 2.5 KiB. - constexpr int numMappings = - sizeof(kNormalizationMappings) / sizeof(NormalizationPair); - map->reserve(numMappings); - for (size_t i = 0; i < numMappings; ++i) { - map->emplace(kNormalizationMappings[i].from, - kNormalizationMappings[i].to); - } - return map; - }(); + static const std::unordered_map<char16_t, char16_t> normalization_map = [] { + std::unordered_map<char16_t, char16_t> map; + // Size of all the mappings is about 2.5 KiB. + constexpr int numMappings = + sizeof(kNormalizationMappings) / sizeof(NormalizationPair); + map.reserve(numMappings); + for (size_t i = 0; i < numMappings; ++i) { + map.emplace(kNormalizationMappings[i].from, kNormalizationMappings[i].to); + } + return map; + }(); return normalization_map; } diff --git a/icing/transform/map/normalization-map.h b/icing/transform/map/normalization-map.h index ac7872b..aea85bd 100644 --- a/icing/transform/map/normalization-map.h +++ b/icing/transform/map/normalization-map.h @@ -23,7 +23,7 @@ namespace lib { // Returns a map containing normalization mappings. A mapping (A -> B) means // that we'll transform every character 'A' into 'B'. 
See normalization-map.cc // for mapping details. -const std::unordered_map<char16_t, char16_t>* GetNormalizationMap(); +const std::unordered_map<char16_t, char16_t>& GetNormalizationMap(); } // namespace lib } // namespace icing diff --git a/icing/transform/normalizer.h b/icing/transform/normalizer.h index 2110f0f..4cbfa63 100644 --- a/icing/transform/normalizer.h +++ b/icing/transform/normalizer.h @@ -20,7 +20,6 @@ #include <string_view> #include "icing/text_classifier/lib3/utils/base/statusor.h" -#include "icing/util/character-iterator.h" namespace icing { namespace lib { @@ -40,17 +39,6 @@ class Normalizer { // Normalizes the input term based on rules. See implementation classes for // specific transformation rules. virtual std::string NormalizeTerm(std::string_view term) const = 0; - - // Returns a CharacterIterator pointing to one past the end of the segment of - // term that (once normalized) matches with normalized_term. - // - // Ex. FindNormalizedMatchEndPosition("YELLOW", "yell") will return - // CharacterIterator(u8:4, u16:4, u32:4). - // - // Ex. FindNormalizedMatchEndPosition("YELLOW", "red") will return - // CharacterIterator(u8:0, u16:0, u32:0). - virtual CharacterIterator FindNormalizedMatchEndPosition( - std::string_view term, std::string_view normalized_term) const = 0; }; } // namespace lib diff --git a/icing/transform/simple/none-normalizer-factory.cc b/icing/transform/simple/none-normalizer-factory.cc new file mode 100644 index 0000000..6b35270 --- /dev/null +++ b/icing/transform/simple/none-normalizer-factory.cc @@ -0,0 +1,53 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_FACTORY_H_ +#define ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_FACTORY_H_ + +#include <memory> +#include <string_view> + +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/transform/normalizer.h" +#include "icing/transform/simple/none-normalizer.h" + +namespace icing { +namespace lib { + +namespace normalizer_factory { + +// Creates a dummy normalizer. The term is not normalized, but +// the text will be truncated to max_term_byte_size if it exceeds the max size. 
+// +// Returns: +// A normalizer on success +// INVALID_ARGUMENT if max_term_byte_size <= 0 +// INTERNAL_ERROR on errors +libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create( + int max_term_byte_size) { + if (max_term_byte_size <= 0) { + return absl_ports::InvalidArgumentError( + "max_term_byte_size must be greater than zero."); + } + + return std::make_unique<NoneNormalizer>(max_term_byte_size); +} + +} // namespace normalizer_factory + +} // namespace lib +} // namespace icing + +#endif // ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_FACTORY_H_ diff --git a/icing/transform/simple/none-normalizer.h b/icing/transform/simple/none-normalizer.h new file mode 100644 index 0000000..47085e1 --- /dev/null +++ b/icing/transform/simple/none-normalizer.h @@ -0,0 +1,51 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_H_ +#define ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_H_ + +#include <string> +#include <string_view> + +#include "icing/transform/normalizer.h" + +namespace icing { +namespace lib { + +// This normalizer is not meant for production use. Currently only used to get +// the Icing library to compile in Jetpack. +// +// No normalization is done, but the term is truncated if it exceeds +// max_term_byte_size. +class NoneNormalizer : public Normalizer { + public: + explicit NoneNormalizer(int max_term_byte_size) + : max_term_byte_size_(max_term_byte_size){}; + + std::string NormalizeTerm(std::string_view term) const override { + if (term.length() > max_term_byte_size_) { + return std::string(term.substr(0, max_term_byte_size_)); + } + return std::string(term); + } + + private: + // The maximum term length allowed after normalization. + int max_term_byte_size_; +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_H_ diff --git a/icing/transform/simple/none-normalizer_test.cc b/icing/transform/simple/none-normalizer_test.cc new file mode 100644 index 0000000..e074828 --- /dev/null +++ b/icing/transform/simple/none-normalizer_test.cc @@ -0,0 +1,74 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
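A side note on the NoneNormalizer added above, before its tests below: NormalizeTerm truncates on raw bytes via substr, so a cut can land inside a multi-byte UTF-8 sequence. A minimal stand-alone illustration of that edge case (the TruncateBytes helper is ours, mirroring the substr logic above; it is a sketch, not part of this change):

    #include <cassert>
    #include <string>
    #include <string_view>

    // Mirrors NoneNormalizer::NormalizeTerm's truncation (helper name is ours).
    std::string TruncateBytes(std::string_view term, size_t max_bytes) {
      if (term.length() > max_bytes) {
        return std::string(term.substr(0, max_bytes));
      }
      return std::string(term);
    }

    int main() {
      // "ü" encodes as the two bytes 0xC3 0xBC, so a 2-byte cap splits it.
      std::string out = TruncateBytes("Zürich", 2);
      assert(out.size() == 2);
      assert(static_cast<unsigned char>(out[1]) == 0xC3);  // dangling lead byte
      return 0;
    }

This is acceptable for the Jetpack compile-only use the class comment describes, but it is one reason the class is flagged as not meant for production.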
+
+#include <memory>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/transform/normalizer-factory.h"
+#include "icing/transform/normalizer.h"
+
+namespace icing {
+namespace lib {
+namespace {
+
+using ::testing::Eq;
+
+TEST(NoneNormalizerTest, Creation) {
+  EXPECT_THAT(normalizer_factory::Create(
+                  /*max_term_byte_size=*/5),
+              IsOk());
+  EXPECT_THAT(normalizer_factory::Create(
+                  /*max_term_byte_size=*/0),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+  EXPECT_THAT(normalizer_factory::Create(
+                  /*max_term_byte_size=*/-1),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(NoneNormalizerTest, NoNormalizationDone) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+                                                  /*max_term_byte_size=*/1000));
+  EXPECT_THAT(normalizer->NormalizeTerm(""), Eq(""));
+  EXPECT_THAT(normalizer->NormalizeTerm("hello world"), Eq("hello world"));
+
+  // Capitalization
+  EXPECT_THAT(normalizer->NormalizeTerm("MDI"), Eq("MDI"));
+
+  // Accents
+  EXPECT_THAT(normalizer->NormalizeTerm("Zürich"), Eq("Zürich"));
+
+  // Full-width punctuation to ASCII punctuation
+  EXPECT_THAT(normalizer->NormalizeTerm("。,!?:”"), Eq("。,!?:”"));
+
+  // Half-width katakana
+  EXPECT_THAT(normalizer->NormalizeTerm("カ"), Eq("カ"));
+}
+
+TEST(NoneNormalizerTest, Truncate) {
+  ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+                                                  /*max_term_byte_size=*/5));
+
+  // Won't be truncated
+  EXPECT_THAT(normalizer->NormalizeTerm("hi"), Eq("hi"));
+  EXPECT_THAT(normalizer->NormalizeTerm("hello"), Eq("hello"));
+
+  // Truncated to length 5.
+  EXPECT_THAT(normalizer->NormalizeTerm("hello!"), Eq("hello"));
+}
+
+}  // namespace
+}  // namespace lib
+}  // namespace icing
diff --git a/icing/util/character-iterator.cc b/icing/util/character-iterator.cc
index 0ab1e50..6c5faef 100644
--- a/icing/util/character-iterator.cc
+++ b/icing/util/character-iterator.cc
@@ -14,8 +14,6 @@
 #include "icing/util/character-iterator.h"
 
-#include "icing/util/i18n-utils.h"
-
 namespace icing {
 namespace lib {
@@ -32,37 +30,22 @@ int GetUTF8StartPosition(std::string_view text, int current_byte_index) {
 }  // namespace
 
-UChar32 CharacterIterator::GetCurrentChar() {
-  if (cached_current_char_ == i18n_utils::kInvalidUChar32) {
-    // Our indices point to the right character, we just need to read that
-    // character. No need to worry about an error. If GetUChar32At fails, then
-    // current_char will be i18n_utils::kInvalidUChar32.
-    cached_current_char_ =
-        i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
-  }
-  return cached_current_char_;
-}
-
 bool CharacterIterator::MoveToUtf8(int desired_utf8_index) {
   return (desired_utf8_index > utf8_index_) ? AdvanceToUtf8(desired_utf8_index)
                                             : RewindToUtf8(desired_utf8_index);
 }
 
 bool CharacterIterator::AdvanceToUtf8(int desired_utf8_index) {
-  ResetToStartIfNecessary();
-
   if (desired_utf8_index > text_.length()) {
     // Enforce the requirement.
     return false;
   }
   // Need to work forwards.
-  UChar32 uchar32 = cached_current_char_;
   while (utf8_index_ < desired_utf8_index) {
-    uchar32 =
+    UChar32 uchar32 =
         i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
     if (uchar32 == i18n_utils::kInvalidUChar32) {
       // Unable to retrieve a valid UTF-32 character at the previous position.
- cached_current_char_ = i18n_utils::kInvalidUChar32; return false; } int utf8_length = i18n_utils::GetUtf8Length(uchar32); @@ -74,8 +57,6 @@ bool CharacterIterator::AdvanceToUtf8(int desired_utf8_index) { utf16_index_ += i18n_utils::GetUtf16Length(uchar32); ++utf32_index_; } - cached_current_char_ = - i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_); return true; } @@ -85,30 +66,21 @@ bool CharacterIterator::RewindToUtf8(int desired_utf8_index) { return false; } // Need to work backwards. - UChar32 uchar32 = cached_current_char_; while (utf8_index_ > desired_utf8_index) { - int utf8_index = utf8_index_ - 1; - utf8_index = GetUTF8StartPosition(text_, utf8_index); - if (utf8_index < 0) { + --utf8_index_; + utf8_index_ = GetUTF8StartPosition(text_, utf8_index_); + if (utf8_index_ < 0) { // Somehow, there wasn't a single UTF-8 lead byte at // requested_byte_index or an earlier byte. - cached_current_char_ = i18n_utils::kInvalidUChar32; return false; } // We've found the start of a unicode char! - uchar32 = - i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index); - int expected_length = utf8_index_ - utf8_index; - if (uchar32 == i18n_utils::kInvalidUChar32 || - expected_length != i18n_utils::GetUtf8Length(uchar32)) { - // Either unable to retrieve a valid UTF-32 character at the previous - // position or we skipped past an invalid sequence while seeking the - // previous start position. - cached_current_char_ = i18n_utils::kInvalidUChar32; + UChar32 uchar32 = + i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_); + if (uchar32 == i18n_utils::kInvalidUChar32) { + // Unable to retrieve a valid UTF-32 character at the previous position. return false; } - cached_current_char_ = uchar32; - utf8_index_ = utf8_index; utf16_index_ -= i18n_utils::GetUtf16Length(uchar32); --utf32_index_; } @@ -122,15 +94,11 @@ bool CharacterIterator::MoveToUtf16(int desired_utf16_index) { } bool CharacterIterator::AdvanceToUtf16(int desired_utf16_index) { - ResetToStartIfNecessary(); - - UChar32 uchar32 = cached_current_char_; while (utf16_index_ < desired_utf16_index) { - uchar32 = + UChar32 uchar32 = i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_); if (uchar32 == i18n_utils::kInvalidUChar32) { // Unable to retrieve a valid UTF-32 character at the previous position. - cached_current_char_ = i18n_utils::kInvalidUChar32; return false; } int utf16_length = i18n_utils::GetUtf16Length(uchar32); @@ -141,15 +109,12 @@ bool CharacterIterator::AdvanceToUtf16(int desired_utf16_index) { int utf8_length = i18n_utils::GetUtf8Length(uchar32); if (utf8_index_ + utf8_length > text_.length()) { // Enforce the requirement. - cached_current_char_ = i18n_utils::kInvalidUChar32; return false; } utf8_index_ += utf8_length; utf16_index_ += utf16_length; ++utf32_index_; } - cached_current_char_ = - i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_); return true; } @@ -157,30 +122,21 @@ bool CharacterIterator::RewindToUtf16(int desired_utf16_index) { if (desired_utf16_index < 0) { return false; } - UChar32 uchar32 = cached_current_char_; while (utf16_index_ > desired_utf16_index) { - int utf8_index = utf8_index_ - 1; - utf8_index = GetUTF8StartPosition(text_, utf8_index); - if (utf8_index < 0) { + --utf8_index_; + utf8_index_ = GetUTF8StartPosition(text_, utf8_index_); + if (utf8_index_ < 0) { // Somehow, there wasn't a single UTF-8 lead byte at // requested_byte_index or an earlier byte. 
- cached_current_char_ = i18n_utils::kInvalidUChar32; return false; } // We've found the start of a unicode char! - uchar32 = - i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index); - int expected_length = utf8_index_ - utf8_index; - if (uchar32 == i18n_utils::kInvalidUChar32 || - expected_length != i18n_utils::GetUtf8Length(uchar32)) { - // Either unable to retrieve a valid UTF-32 character at the previous - // position or we skipped past an invalid sequence while seeking the - // previous start position. - cached_current_char_ = i18n_utils::kInvalidUChar32; + UChar32 uchar32 = + i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_); + if (uchar32 == i18n_utils::kInvalidUChar32) { + // Unable to retrieve a valid UTF-32 character at the previous position. return false; } - cached_current_char_ = uchar32; - utf8_index_ = utf8_index; utf16_index_ -= i18n_utils::GetUtf16Length(uchar32); --utf32_index_; } @@ -194,30 +150,23 @@ bool CharacterIterator::MoveToUtf32(int desired_utf32_index) { } bool CharacterIterator::AdvanceToUtf32(int desired_utf32_index) { - ResetToStartIfNecessary(); - - UChar32 uchar32 = cached_current_char_; while (utf32_index_ < desired_utf32_index) { - uchar32 = + UChar32 uchar32 = i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_); if (uchar32 == i18n_utils::kInvalidUChar32) { // Unable to retrieve a valid UTF-32 character at the previous position. - cached_current_char_ = i18n_utils::kInvalidUChar32; return false; } int utf16_length = i18n_utils::GetUtf16Length(uchar32); int utf8_length = i18n_utils::GetUtf8Length(uchar32); if (utf8_index_ + utf8_length > text_.length()) { // Enforce the requirement. - cached_current_char_ = i18n_utils::kInvalidUChar32; return false; } utf8_index_ += utf8_length; utf16_index_ += utf16_length; ++utf32_index_; } - cached_current_char_ = - i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_); return true; } @@ -225,45 +174,26 @@ bool CharacterIterator::RewindToUtf32(int desired_utf32_index) { if (desired_utf32_index < 0) { return false; } - UChar32 uchar32 = cached_current_char_; while (utf32_index_ > desired_utf32_index) { - int utf8_index = utf8_index_ - 1; - utf8_index = GetUTF8StartPosition(text_, utf8_index); - if (utf8_index < 0) { + --utf8_index_; + utf8_index_ = GetUTF8StartPosition(text_, utf8_index_); + if (utf8_index_ < 0) { // Somehow, there wasn't a single UTF-8 lead byte at // requested_byte_index or an earlier byte. - cached_current_char_ = i18n_utils::kInvalidUChar32; return false; } // We've found the start of a unicode char! - uchar32 = - i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index); - int expected_length = utf8_index_ - utf8_index; - if (uchar32 == i18n_utils::kInvalidUChar32 || - expected_length != i18n_utils::GetUtf8Length(uchar32)) { - // Either unable to retrieve a valid UTF-32 character at the previous - // position or we skipped past an invalid sequence while seeking the - // previous start position. - cached_current_char_ = i18n_utils::kInvalidUChar32; + UChar32 uchar32 = + i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_); + if (uchar32 == i18n_utils::kInvalidUChar32) { + // Unable to retrieve a valid UTF-32 character at the previous position. 
return false; } - cached_current_char_ = uchar32; - utf8_index_ = utf8_index; utf16_index_ -= i18n_utils::GetUtf16Length(uchar32); --utf32_index_; } return true; } -void CharacterIterator::ResetToStartIfNecessary() { - if (utf8_index_ < 0 || utf16_index_ < 0 || utf32_index_ < 0) { - utf8_index_ = 0; - utf16_index_ = 0; - utf32_index_ = 0; - cached_current_char_ = - i18n_utils::GetUChar32At(text_.data(), text_.length(), 0); - } -} - } // namespace lib } // namespace icing diff --git a/icing/util/character-iterator.h b/icing/util/character-iterator.h index 893718a..9df7bee 100644 --- a/icing/util/character-iterator.h +++ b/icing/util/character-iterator.h @@ -29,15 +29,10 @@ class CharacterIterator { CharacterIterator(std::string_view text, int utf8_index, int utf16_index, int utf32_index) : text_(text), - cached_current_char_(i18n_utils::kInvalidUChar32), utf8_index_(utf8_index), utf16_index_(utf16_index), utf32_index_(utf32_index) {} - // Returns the character that the iterator currently points to. - // i18n_utils::kInvalidUChar32 if unable to read that character. - UChar32 GetCurrentChar(); - // Moves current position to desired_utf8_index. // REQUIRES: 0 <= desired_utf8_index <= text_.length() bool MoveToUtf8(int desired_utf8_index); @@ -87,8 +82,6 @@ class CharacterIterator { int utf32_index() const { return utf32_index_; } bool operator==(const CharacterIterator& rhs) const { - // cached_current_char_ is just that: a cached value. As such, it's not - // considered for equality. return text_ == rhs.text_ && utf8_index_ == rhs.utf8_index_ && utf16_index_ == rhs.utf16_index_ && utf32_index_ == rhs.utf32_index_; } @@ -99,12 +92,7 @@ class CharacterIterator { } private: - // Resets the character iterator to the start of the text if any of the - // indices are negative. - void ResetToStartIfNecessary(); - std::string_view text_; - UChar32 cached_current_char_; int utf8_index_; int utf16_index_; int utf32_index_; diff --git a/icing/util/character-iterator_test.cc b/icing/util/character-iterator_test.cc deleted file mode 100644 index 195a47b..0000000 --- a/icing/util/character-iterator_test.cc +++ /dev/null @@ -1,266 +0,0 @@ -// Copyright (C) 2019 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
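A note on the rewind paths above: RewindToUtf8, RewindToUtf16, and RewindToUtf32 all lean on GetUTF8StartPosition, whose body is elided from the hunk at the top of this file. The idea is a backward scan past UTF-8 continuation bytes, which always carry the bit pattern 10xxxxxx. A minimal sketch under that assumption, with our own naming:

    #include <string_view>

    // Sketch of a lead-byte scan in the spirit of GetUTF8StartPosition (the
    // real body is not shown in this diff). UTF-8 continuation bytes are
    // 0b10xxxxxx, so mask with 0xC0 and compare against 0x80.
    int FindUtf8LeadByte(std::string_view text, int byte_index) {
      while (byte_index >= 0 &&
             (static_cast<unsigned char>(text[byte_index]) & 0xC0) == 0x80) {
        --byte_index;  // step back past continuation bytes
      }
      return byte_index;  // index of the lead byte, or -1 if none was found
    }

If no lead byte exists at or before the requested index, the rewind functions above treat that as corrupt input and bail out with false.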
- -#include "icing/util/character-iterator.h" - -#include "gmock/gmock.h" -#include "gtest/gtest.h" -#include "icing/testing/icu-i18n-test-utils.h" - -namespace icing { -namespace lib { - -using ::testing::Eq; -using ::testing::IsFalse; -using ::testing::IsTrue; - -TEST(CharacterIteratorTest, BasicUtf8) { - constexpr std::string_view kText = "¿Dónde está la biblioteca?"; - CharacterIterator iterator(kText); - EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("¿")); - - EXPECT_THAT(iterator.AdvanceToUtf8(4), IsTrue()); - EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("ó")); - EXPECT_THAT(iterator, - Eq(CharacterIterator(kText, /*utf8_index=*/3, /*utf16_index=*/2, - /*utf32_index=*/2))); - - EXPECT_THAT(iterator.AdvanceToUtf8(18), IsTrue()); - EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b")); - EXPECT_THAT(iterator, - Eq(CharacterIterator(kText, /*utf8_index=*/18, /*utf16_index=*/15, - /*utf32_index=*/15))); - - EXPECT_THAT(iterator.AdvanceToUtf8(28), IsTrue()); - EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("?")); - EXPECT_THAT(iterator, - Eq(CharacterIterator(kText, /*utf8_index=*/28, /*utf16_index=*/25, - /*utf32_index=*/25))); - - EXPECT_THAT(iterator.AdvanceToUtf8(29), IsTrue()); - EXPECT_THAT(iterator.GetCurrentChar(), Eq(0)); - EXPECT_THAT(iterator, - Eq(CharacterIterator(kText, /*utf8_index=*/29, /*utf16_index=*/26, - /*utf32_index=*/26))); - - EXPECT_THAT(iterator.RewindToUtf8(28), IsTrue()); - EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("?")); - EXPECT_THAT(iterator, - Eq(CharacterIterator(kText, /*utf8_index=*/28, /*utf16_index=*/25, - /*utf32_index=*/25))); - - EXPECT_THAT(iterator.RewindToUtf8(18), IsTrue()); - EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b")); - EXPECT_THAT(iterator, - Eq(CharacterIterator(kText, /*utf8_index=*/18, /*utf16_index=*/15, - /*utf32_index=*/15))); - - EXPECT_THAT(iterator.RewindToUtf8(4), IsTrue()); - EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("ó")); - EXPECT_THAT(iterator, - Eq(CharacterIterator(kText, /*utf8_index=*/3, /*utf16_index=*/2, - /*utf32_index=*/2))); - - EXPECT_THAT(iterator.RewindToUtf8(0), IsTrue()); - EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("¿")); - EXPECT_THAT(iterator, - Eq(CharacterIterator(kText, /*utf8_index=*/0, /*utf16_index=*/0, - /*utf32_index=*/0))); -} - -TEST(CharacterIteratorTest, BasicUtf16) { - constexpr std::string_view kText = "¿Dónde está la biblioteca?"; - CharacterIterator iterator(kText); - EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("¿")); - - EXPECT_THAT(iterator.AdvanceToUtf16(2), IsTrue()); - EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("ó")); - EXPECT_THAT(iterator, - Eq(CharacterIterator(kText, /*utf8_index=*/3, /*utf16_index=*/2, - /*utf32_index=*/2))); - - EXPECT_THAT(iterator.AdvanceToUtf16(15), IsTrue()); - EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b")); - EXPECT_THAT(iterator, - Eq(CharacterIterator(kText, /*utf8_index=*/18, /*utf16_index=*/15, - /*utf32_index=*/15))); - - EXPECT_THAT(iterator.AdvanceToUtf16(25), IsTrue()); - EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("?")); - EXPECT_THAT(iterator, - Eq(CharacterIterator(kText, /*utf8_index=*/28, /*utf16_index=*/25, - /*utf32_index=*/25))); - - EXPECT_THAT(iterator.AdvanceToUtf16(26), IsTrue()); - EXPECT_THAT(iterator.GetCurrentChar(), Eq(0)); - EXPECT_THAT(iterator, - Eq(CharacterIterator(kText, /*utf8_index=*/29, /*utf16_index=*/26, - /*utf32_index=*/26))); - - 
EXPECT_THAT(iterator.RewindToUtf16(25), IsTrue());
-  EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("?"));
-  EXPECT_THAT(iterator,
-              Eq(CharacterIterator(kText, /*utf8_index=*/28, /*utf16_index=*/25,
-                                   /*utf32_index=*/25)));
-
-  EXPECT_THAT(iterator.RewindToUtf16(15), IsTrue());
-  EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b"));
-  EXPECT_THAT(iterator,
-              Eq(CharacterIterator(kText, /*utf8_index=*/18, /*utf16_index=*/15,
-                                   /*utf32_index=*/15)));
-
-  EXPECT_THAT(iterator.RewindToUtf16(2), IsTrue());
-  EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("ó"));
-  EXPECT_THAT(iterator,
-              Eq(CharacterIterator(kText, /*utf8_index=*/3, /*utf16_index=*/2,
-                                   /*utf32_index=*/2)));
-
-  EXPECT_THAT(iterator.RewindToUtf8(0), IsTrue());
-  EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("¿"));
-  EXPECT_THAT(iterator,
-              Eq(CharacterIterator(kText, /*utf8_index=*/0, /*utf16_index=*/0,
-                                   /*utf32_index=*/0)));
-}
-
-TEST(CharacterIteratorTest, BasicUtf32) {
-  constexpr std::string_view kText = "¿Dónde está la biblioteca?";
-  CharacterIterator iterator(kText);
-  EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("¿"));
-
-  EXPECT_THAT(iterator.AdvanceToUtf32(2), IsTrue());
-  EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("ó"));
-  EXPECT_THAT(iterator,
-              Eq(CharacterIterator(kText, /*utf8_index=*/3, /*utf16_index=*/2,
-                                   /*utf32_index=*/2)));
-
-  EXPECT_THAT(iterator.AdvanceToUtf32(15), IsTrue());
-  EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b"));
-  EXPECT_THAT(iterator,
-              Eq(CharacterIterator(kText, /*utf8_index=*/18, /*utf16_index=*/15,
-                                   /*utf32_index=*/15)));
-
-  EXPECT_THAT(iterator.AdvanceToUtf32(25), IsTrue());
-  EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("?"));
-  EXPECT_THAT(iterator,
-              Eq(CharacterIterator(kText, /*utf8_index=*/28, /*utf16_index=*/25,
-                                   /*utf32_index=*/25)));
-
-  EXPECT_THAT(iterator.AdvanceToUtf32(26), IsTrue());
-  EXPECT_THAT(iterator.GetCurrentChar(), Eq(0));
-  EXPECT_THAT(iterator,
-              Eq(CharacterIterator(kText, /*utf8_index=*/29, /*utf16_index=*/26,
-                                   /*utf32_index=*/26)));
-
-  EXPECT_THAT(iterator.RewindToUtf32(25), IsTrue());
-  EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("?"));
-  EXPECT_THAT(iterator,
-              Eq(CharacterIterator(kText, /*utf8_index=*/28, /*utf16_index=*/25,
-                                   /*utf32_index=*/25)));
-
-  EXPECT_THAT(iterator.RewindToUtf32(15), IsTrue());
-  EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b"));
-  EXPECT_THAT(iterator,
-              Eq(CharacterIterator(kText, /*utf8_index=*/18, /*utf16_index=*/15,
-                                   /*utf32_index=*/15)));
-
-  EXPECT_THAT(iterator.RewindToUtf32(2), IsTrue());
-  EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("ó"));
-  EXPECT_THAT(iterator,
-              Eq(CharacterIterator(kText, /*utf8_index=*/3, /*utf16_index=*/2,
-                                   /*utf32_index=*/2)));
-
-  EXPECT_THAT(iterator.RewindToUtf32(0), IsTrue());
-  EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("¿"));
-  EXPECT_THAT(iterator,
-              Eq(CharacterIterator(kText, /*utf8_index=*/0, /*utf16_index=*/0,
-                                   /*utf32_index=*/0)));
-}
-
-TEST(CharacterIteratorTest, InvalidUtf) {
-  // "\255" is an invalid sequence.
-  constexpr std::string_view kText = "foo \255 bar";
-  CharacterIterator iterator(kText);
-
-  // Try to advance to the 'b' in 'bar'. This will fail and leave us pointed at
-  // the invalid sequence '\255'. GetCurrentChar() should return an invalid
-  // character.
- EXPECT_THAT(iterator.AdvanceToUtf8(6), IsFalse()); - EXPECT_THAT(iterator.GetCurrentChar(), Eq(i18n_utils::kInvalidUChar32)); - CharacterIterator exp_iterator(kText, /*utf8_index=*/4, /*utf16_index=*/4, - /*utf32_index=*/4); - EXPECT_THAT(iterator, Eq(exp_iterator)); - - EXPECT_THAT(iterator.AdvanceToUtf16(6), IsFalse()); - EXPECT_THAT(iterator.GetCurrentChar(), Eq(i18n_utils::kInvalidUChar32)); - EXPECT_THAT(iterator, Eq(exp_iterator)); - - EXPECT_THAT(iterator.AdvanceToUtf32(6), IsFalse()); - EXPECT_THAT(iterator.GetCurrentChar(), Eq(i18n_utils::kInvalidUChar32)); - EXPECT_THAT(iterator, Eq(exp_iterator)); - - // Create the iterator with it pointing at the 'b' in 'bar'. - iterator = CharacterIterator(kText, /*utf8_index=*/6, /*utf16_index=*/6, - /*utf32_index=*/6); - EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b")); - - // Try to advance to the last 'o' in 'foo'. This will fail and leave us - // pointed at the ' ' before the invalid sequence '\255'. - exp_iterator = CharacterIterator(kText, /*utf8_index=*/5, /*utf16_index=*/5, - /*utf32_index=*/5); - EXPECT_THAT(iterator.RewindToUtf8(2), IsFalse()); - EXPECT_THAT(iterator.GetCurrentChar(), Eq(' ')); - EXPECT_THAT(iterator, Eq(exp_iterator)); - - EXPECT_THAT(iterator.RewindToUtf16(2), IsFalse()); - EXPECT_THAT(iterator.GetCurrentChar(), Eq(' ')); - EXPECT_THAT(iterator, Eq(exp_iterator)); - - EXPECT_THAT(iterator.RewindToUtf32(2), IsFalse()); - EXPECT_THAT(iterator.GetCurrentChar(), Eq(' ')); - EXPECT_THAT(iterator, Eq(exp_iterator)); -} - -TEST(CharacterIteratorTest, MoveToUtfNegativeIndex) { - constexpr std::string_view kText = "¿Dónde está la biblioteca?"; - - CharacterIterator iterator_utf8(kText, /*utf8_index=*/-1, /*utf16_index=*/0, - /*utf32_index=*/0); - // We should be able to successfully move when the index is negative. - EXPECT_THAT(iterator_utf8.MoveToUtf8(0), IsTrue()); - // The character cache should be reset and contain the first character when - // resetting to index 0. 
- EXPECT_THAT(UCharToString(iterator_utf8.GetCurrentChar()), Eq("¿")); - EXPECT_THAT(iterator_utf8.utf8_index(), Eq(0)); - EXPECT_THAT(iterator_utf8.utf16_index(), Eq(0)); - EXPECT_THAT(iterator_utf8.utf32_index(), Eq(0)); - - CharacterIterator iterator_utf16(kText, /*utf8_index=*/0, /*utf16_index=*/-1, - /*utf32_index=*/0); - EXPECT_THAT(iterator_utf16.MoveToUtf16(1), IsTrue()); - EXPECT_THAT(iterator_utf16.GetCurrentChar(), Eq('D')); - EXPECT_THAT(iterator_utf16.utf8_index(), Eq(2)); - EXPECT_THAT(iterator_utf16.utf16_index(), Eq(1)); - EXPECT_THAT(iterator_utf16.utf32_index(), Eq(1)); - - CharacterIterator iterator_utf32(kText, /*utf8_index=*/0, /*utf16_index=*/0, - /*utf32_index=*/-1); - EXPECT_THAT(iterator_utf32.MoveToUtf32(2), IsTrue()); - EXPECT_THAT(UCharToString(iterator_utf32.GetCurrentChar()), Eq("ó")); - EXPECT_THAT(iterator_utf32.utf8_index(), Eq(3)); - EXPECT_THAT(iterator_utf32.utf16_index(), Eq(2)); - EXPECT_THAT(iterator_utf32.utf32_index(), Eq(2)); -} - -} // namespace lib -} // namespace icing diff --git a/icing/util/document-validator_test.cc b/icing/util/document-validator_test.cc index 45c23e0..cb013d7 100644 --- a/icing/util/document-validator_test.cc +++ b/icing/util/document-validator_test.cc @@ -46,15 +46,15 @@ constexpr char kPropertyEmails[] = "emails"; constexpr char kDefaultNamespace[] = "icing"; constexpr char kDefaultString[] = "This is a string."; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = - PropertyConfigProto::Cardinality::OPTIONAL; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED = - PropertyConfigProto::Cardinality::REQUIRED; -constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED = - PropertyConfigProto::Cardinality::REPEATED; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = + PropertyConfigProto_Cardinality_Code_OPTIONAL; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REQUIRED = + PropertyConfigProto_Cardinality_Code_REQUIRED; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED = + PropertyConfigProto_Cardinality_Code_REPEATED; -constexpr PropertyConfigProto::DataType::Code TYPE_STRING = - PropertyConfigProto::DataType::STRING; +constexpr PropertyConfigProto_DataType_Code TYPE_STRING = + PropertyConfigProto_DataType_Code_STRING; class DocumentValidatorTest : public ::testing::Test { protected: @@ -93,11 +93,9 @@ class DocumentValidatorTest : public ::testing::Test { .SetCardinality(CARDINALITY_REPEATED))) .Build(); - schema_dir_ = GetTestTempDir() + "/schema_store"; - ASSERT_TRUE(filesystem_.CreateDirectory(schema_dir_.c_str())); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, schema_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, GetTestTempDir(), &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); document_validator_ = @@ -124,7 +122,6 @@ class DocumentValidatorTest : public ::testing::Test { SimpleEmailBuilder().Build()); } - std::string schema_dir_; std::unique_ptr<DocumentValidator> document_validator_; std::unique_ptr<SchemaStore> schema_store_; Filesystem filesystem_; diff --git a/icing/util/i18n-utils.cc b/icing/util/i18n-utils.cc index ec327ad..cd0a227 100644 --- a/icing/util/i18n-utils.cc +++ b/icing/util/i18n-utils.cc @@ -116,8 +116,6 @@ bool IsAscii(char c) { return U8_IS_SINGLE((uint8_t)c); } bool IsAscii(UChar32 c) { return U8_LENGTH(c) == 1; } -bool IsAlphaNumeric(UChar32 c) { return u_isalnum(c); } - int GetUtf8Length(UChar32 c) { return U8_LENGTH(c); } int 
GetUtf16Length(UChar32 c) { return U16_LENGTH(c); } diff --git a/icing/util/i18n-utils.h b/icing/util/i18n-utils.h index 491df6b..82ae828 100644 --- a/icing/util/i18n-utils.h +++ b/icing/util/i18n-utils.h @@ -67,9 +67,6 @@ bool IsAscii(char c); // Checks if the Unicode char is within ASCII range. bool IsAscii(UChar32 c); -// Checks if the Unicode char is alphanumeric. -bool IsAlphaNumeric(UChar32 c); - // Returns how many code units (char) are used for the UTF-8 encoding of this // Unicode character. Returns 0 if not valid. int GetUtf8Length(UChar32 c); diff --git a/java/Android.bp b/java/Android.bp index 6133230..ef417ba 100644 --- a/java/Android.bp +++ b/java/Android.bp @@ -32,6 +32,5 @@ java_library { "androidx.annotation_annotation", ], sdk_version: "current", - min_sdk_version: "Tiramisu", apex_available: ["com.android.appsearch"], } diff --git a/java/src/com/google/android/icing/IcingSearchEngine.java b/java/src/com/google/android/icing/IcingSearchEngine.java index 95e0c84..1f5fb51 100644 --- a/java/src/com/google/android/icing/IcingSearchEngine.java +++ b/java/src/com/google/android/icing/IcingSearchEngine.java @@ -43,8 +43,6 @@ import com.google.android.icing.proto.SearchSpecProto; import com.google.android.icing.proto.SetSchemaResultProto; import com.google.android.icing.proto.StatusProto; import com.google.android.icing.proto.StorageInfoResultProto; -import com.google.android.icing.proto.SuggestionResponse; -import com.google.android.icing.proto.SuggestionSpecProto; import com.google.android.icing.proto.UsageReport; import com.google.protobuf.ExtensionRegistryLite; import com.google.protobuf.InvalidProtocolBufferException; @@ -372,26 +370,6 @@ public class IcingSearchEngine implements Closeable { } @NonNull - public SuggestionResponse searchSuggestions(@NonNull SuggestionSpecProto suggestionSpec) { - byte[] suggestionResponseBytes = nativeSearchSuggestions(this, suggestionSpec.toByteArray()); - if (suggestionResponseBytes == null) { - Log.e(TAG, "Received null suggestionResponseBytes from native."); - return SuggestionResponse.newBuilder() - .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL)) - .build(); - } - - try { - return SuggestionResponse.parseFrom(suggestionResponseBytes, EXTENSION_REGISTRY_LITE); - } catch (InvalidProtocolBufferException e) { - Log.e(TAG, "Error parsing suggestionResponseBytes.", e); - return SuggestionResponse.newBuilder() - .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL)) - .build(); - } - } - - @NonNull public DeleteByNamespaceResultProto deleteByNamespace(@NonNull String namespace) { throwIfClosed(); @@ -626,7 +604,4 @@ public class IcingSearchEngine implements Closeable { private static native byte[] nativeGetStorageInfo(IcingSearchEngine instance); private static native byte[] nativeReset(IcingSearchEngine instance); - - private static native byte[] nativeSearchSuggestions( - IcingSearchEngine instance, byte[] suggestionSpecBytes); } diff --git a/java/tests/instrumentation/src/androidx/appsearch/smoketest/AndroidXSmokeTest.java b/java/tests/instrumentation/src/androidx/appsearch/smoketest/AppSearchSmokeTest.java index 98b1b25..8fae104 100644 --- a/java/tests/instrumentation/src/androidx/appsearch/smoketest/AndroidXSmokeTest.java +++ b/java/tests/instrumentation/src/androidx/appsearch/smoketest/AppSearchSmokeTest.java @@ -24,7 +24,6 @@ import androidx.appsearch.app.AppSearchSchema; import androidx.appsearch.app.AppSearchSchema.PropertyConfig; import 
androidx.appsearch.app.AppSearchSchema.StringPropertyConfig; import androidx.appsearch.app.AppSearchSession; -import androidx.appsearch.app.GenericDocument; import androidx.appsearch.app.PutDocumentsRequest; import androidx.appsearch.app.SearchResult; import androidx.appsearch.app.SearchResults; @@ -33,16 +32,15 @@ import androidx.appsearch.app.SetSchemaRequest; import androidx.appsearch.localstorage.LocalStorage; import androidx.appsearch.localstorage.LocalStorage.SearchContext; import androidx.test.core.app.ApplicationProvider; -import androidx.test.ext.junit.runners.AndroidJUnit4; +import androidx.test.filters.SmallTest; import org.junit.Before; import org.junit.Test; -import org.junit.runner.RunWith; import java.util.List; -@RunWith(AndroidJUnit4.class) -public class AndroidXSmokeTest { +@SmallTest +public class AppSearchSmokeTest { private AppSearchSession appSearch; @Before @@ -50,8 +48,7 @@ public class AndroidXSmokeTest { appSearch = LocalStorage.createSearchSession( new SearchContext.Builder( - ApplicationProvider.getApplicationContext(), - "database") + ApplicationProvider.getApplicationContext()) .build()) .get(); // Remove all data before test @@ -82,7 +79,7 @@ public class AndroidXSmokeTest { .build()) .get(); - TestDocument input = new TestDocument("namespace", "id1", "avocado"); + TestDocument input = new TestDocument("uri1", "avocado"); appSearch .put(new PutDocumentsRequest.Builder().addDocuments(input).build()) .get() @@ -98,11 +95,10 @@ public class AndroidXSmokeTest { SearchResult result = page.get(0); assertThat(results.getNextPage().get()).isEmpty(); - GenericDocument genericOutput = result.getGenericDocument(); - assertEquals("id1", genericOutput.getId()); - assertEquals("avocado", genericOutput.getPropertyString("body")); - TestDocument output = genericOutput.toDocumentClass(TestDocument.class); - assertEquals("id1", output.getId()); + assertEquals("uri1", result.getDocument().getUri()); + assertEquals("avocado", result.getDocument().getPropertyString("body")); + TestDocument output = result.getDocument().toDocumentClass(TestDocument.class); + assertEquals("uri1", output.getUri()); assertEquals("avocado", output.getBody()); } } diff --git a/java/tests/instrumentation/src/androidx/appsearch/smoketest/TestDocument.java b/java/tests/instrumentation/src/androidx/appsearch/smoketest/TestDocument.java index ebf32e4..089ff55 100644 --- a/java/tests/instrumentation/src/androidx/appsearch/smoketest/TestDocument.java +++ b/java/tests/instrumentation/src/androidx/appsearch/smoketest/TestDocument.java @@ -21,28 +21,21 @@ import androidx.appsearch.app.AppSearchSchema.StringPropertyConfig; @Document public class TestDocument { - @Document.Namespace private final String mNamespace; + @Document.Uri private final String uri; - @Document.Id private final String mId; + @Document.Property(indexingType = StringPropertyConfig.INDEXING_TYPE_PREFIXES) + private final String body; - @Document.StringProperty(indexingType = StringPropertyConfig.INDEXING_TYPE_PREFIXES) - private final String mBody; - - TestDocument(String namespace, String id, String body) { - mNamespace = namespace; - mId = id; - mBody = body; - } - - public String getNamespace() { - return mNamespace; + TestDocument(String uri, String body) { + this.uri = uri; + this.body = body; } - public String getId() { - return mId; + public String getUri() { + return uri; } public String getBody() { - return mBody; + return body; } } diff --git a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java 
b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java index a46814c..64f98f6 100644 --- a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java +++ b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java @@ -51,11 +51,7 @@ import com.google.android.icing.proto.StatusProto; import com.google.android.icing.proto.StorageInfoResultProto; import com.google.android.icing.proto.StringIndexingConfig; import com.google.android.icing.proto.StringIndexingConfig.TokenizerType; -import com.google.android.icing.proto.SuggestionResponse; -import com.google.android.icing.proto.SuggestionSpecProto; -import com.google.android.icing.proto.SuggestionSpecProto.SuggestionScoringSpecProto; import com.google.android.icing.proto.TermMatchType; -import com.google.android.icing.proto.TermMatchType.Code; import com.google.android.icing.proto.UsageReport; import com.google.android.icing.IcingSearchEngine; import java.io.File; @@ -63,6 +59,7 @@ import java.util.HashMap; import java.util.Map; import org.junit.After; import org.junit.Before; +import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; @@ -493,6 +490,7 @@ public final class IcingSearchEngineTest { } @Test + @Ignore("b/190845688") public void testCJKTSnippets() throws Exception { assertStatusOk(icingSearchEngine.initialize().getStatus()); @@ -500,13 +498,12 @@ public final class IcingSearchEngineTest { assertStatusOk( icingSearchEngine.setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false).getStatus()); - // String: "天是蓝的" - // ^ ^^ ^ - // UTF16 idx: 0 1 2 3 - // Breaks into segments: "天", "是", "蓝", "的" - // "The sky is blue" - String chinese = "天是蓝的"; - assertThat(chinese.length()).isEqualTo(4); + // String: "我每天走路去上班。" + // ^ ^ ^ ^^ + // UTF16 idx: 0 1 3 5 6 + // Breaks into segments: "我", "每天", "走路", "去", "上班" + String chinese = "我每天走路去上班。"; + assertThat(chinese.length()).isEqualTo(9); DocumentProto emailDocument1 = createEmailDocument("namespace", "uri1").toBuilder() .addProperties(PropertyProto.newBuilder().setName("subject").addStringValues(chinese)) @@ -516,7 +513,7 @@ public final class IcingSearchEngineTest { // Search and request snippet matching but no windowing. 
SearchSpecProto searchSpec = SearchSpecProto.newBuilder() - .setQuery("是") + .setQuery("每") .setTermMatchType(TermMatchType.Code.PREFIX) .build(); ResultSpecProto resultSpecProto = @@ -555,9 +552,9 @@ public final class IcingSearchEngineTest { int matchStart = matchProto.getExactMatchUtf16Position(); int matchEnd = matchStart + matchProto.getExactMatchUtf16Length(); assertThat(matchStart).isEqualTo(1); - assertThat(matchEnd).isEqualTo(2); + assertThat(matchEnd).isEqualTo(3); String match = content.substring(matchStart, matchEnd); - assertThat(match).isEqualTo("是"); + assertThat(match).isEqualTo("每天"); } @Test @@ -627,47 +624,6 @@ public final class IcingSearchEngineTest { assertThat(match).isEqualTo("𐀂𐀃"); } - @Test - public void testSearchSuggestions() { - assertStatusOk(icingSearchEngine.initialize().getStatus()); - - SchemaTypeConfigProto emailTypeConfig = createEmailTypeConfig(); - SchemaProto schema = SchemaProto.newBuilder().addTypes(emailTypeConfig).build(); - assertThat( - icingSearchEngine - .setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false) - .getStatus() - .getCode()) - .isEqualTo(StatusProto.Code.OK); - - DocumentProto emailDocument1 = - createEmailDocument("namespace", "uri1").toBuilder() - .addProperties(PropertyProto.newBuilder().setName("subject").addStringValues("fo")) - .build(); - DocumentProto emailDocument2 = - createEmailDocument("namespace", "uri2").toBuilder() - .addProperties(PropertyProto.newBuilder().setName("subject").addStringValues("foo")) - .build(); - assertStatusOk(icingSearchEngine.put(emailDocument1).getStatus()); - assertStatusOk(icingSearchEngine.put(emailDocument2).getStatus()); - - SuggestionSpecProto suggestionSpec = - SuggestionSpecProto.newBuilder() - .setPrefix("f") - .setNumToReturn(10) - .setScoringSpec( - SuggestionScoringSpecProto.newBuilder() - .setScoringMatchType(Code.EXACT_ONLY) - .build()) - .build(); - - SuggestionResponse response = icingSearchEngine.searchSuggestions(suggestionSpec); - assertStatusOk(response.getStatus()); - assertThat(response.getSuggestionsList()).hasSize(2); - assertThat(response.getSuggestions(0).getQuery()).isEqualTo("foo"); - assertThat(response.getSuggestions(1).getQuery()).isEqualTo("fo"); - } - private static void assertStatusOk(StatusProto status) { assertWithMessage(status.getMessage()).that(status.getCode()).isEqualTo(StatusProto.Code.OK); } diff --git a/proto/Android.bp b/proto/Android.bp index cda0ec2..4fb0c18 100644 --- a/proto/Android.bp +++ b/proto/Android.bp @@ -43,5 +43,4 @@ cc_library_static { export_proto_headers: true, }, srcs: ["icing/**/*.proto"], - min_sdk_version: "Tiramisu", } diff --git a/proto/icing/proto/debug.proto b/proto/icing/proto/debug.proto deleted file mode 100644 index 504ae43..0000000 --- a/proto/icing/proto/debug.proto +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright 2022 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
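The deleted file below defines the debug-info messages Icing returns to clients. For orientation, here is a hypothetical consumer of the DebugInfoResultProto defined at the end of the file; accessor names follow standard protobuf C++ codegen (the namespace field surfaces as namespace_() because namespace is a C++ keyword), and the generated header path is an assumption:

    #include <cstdio>

    #include "icing/proto/debug.pb.h"  // assumed generated header path

    void LogCorpusInfo(const icing::lib::DebugInfoResultProto& result) {
      if (result.status().code() != icing::lib::StatusProto::OK) {
        return;
      }
      // One CorpusInfo entry per (namespace, schema type) pair; deleted and
      // expired documents are skipped, per the comments in the file below.
      for (const auto& corpus :
           result.debug_info().document_info().corpus_info()) {
        std::printf("%s/%s: %u documents, %u tokens\n",
                    corpus.namespace_().c_str(), corpus.schema().c_str(),
                    corpus.total_documents(), corpus.total_token());
      }
    }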
- -syntax = "proto2"; - -package icing.lib; - -import "icing/proto/schema.proto"; -import "icing/proto/status.proto"; -import "icing/proto/storage.proto"; - -option java_package = "com.google.android.icing.proto"; -option java_multiple_files = true; -option objc_class_prefix = "ICNG"; - -// Next tag: 4 -message IndexDebugInfoProto { - // Storage information of the index. - optional IndexStorageInfoProto index_storage_info = 1; - - message MainIndexDebugInfoProto { - // Information about the main lexicon. - // TODO(b/222349894) Convert the string output to a protocol buffer instead. - optional string lexicon_info = 1; - - // Last added document id. - optional uint32 last_added_document_id = 2; - - // If verbosity > 0, return information about the posting list storage. - // TODO(b/222349894) Convert the string output to a protocol buffer instead. - optional string flash_index_storage_info = 3; - } - optional MainIndexDebugInfoProto main_index_info = 2; - - message LiteIndexDebugInfoProto { - // Current number of hits. - optional uint32 curr_size = 1; - - // The maximum possible number of hits. - optional uint32 hit_buffer_size = 2; - - // Last added document id. - optional uint32 last_added_document_id = 3; - - // The first position in the hit buffer that is not sorted yet, - // or curr_size if all hits are sorted. - optional uint32 searchable_end = 4; - - // The most recent checksum of the lite index, by calling - // LiteIndex::ComputeChecksum(). - optional uint32 index_crc = 5; - - // Information about the lite lexicon. - // TODO(b/222349894) Convert the string output to a protocol buffer instead. - optional string lexicon_info = 6; - } - optional LiteIndexDebugInfoProto lite_index_info = 3; -} - -// Next tag: 4 -message DocumentDebugInfoProto { - // Storage information of the document store. - optional DocumentStorageInfoProto document_storage_info = 1; - - // The most recent checksum of the document store, by calling - // DocumentStore::ComputeChecksum(). - optional uint32 crc = 2; - - message CorpusInfo { - optional string namespace = 1; - optional string schema = 2; - optional uint32 total_documents = 3; - optional uint32 total_token = 4; - } - - // If verbosity > 0, return the total number of documents and tokens in each - // (namespace, schema type) pair. - // Note that deleted and expired documents are skipped in the output. - repeated CorpusInfo corpus_info = 3; -} - -// Next tag: 3 -message SchemaDebugInfoProto { - // Copy of the SchemaProto if it has been set in the schema store. - // Modifying this does not affect the Schema that IcingSearchEngine holds. - optional SchemaProto schema = 1; - - // The most recent checksum of the schema store, by calling - // SchemaStore::ComputeChecksum(). - optional uint32 crc = 2; -} - -// Next tag: 4 -message DebugInfoProto { - // Debug information of the index. - optional IndexDebugInfoProto index_info = 1; - - // Debug information of the document store. - optional DocumentDebugInfoProto document_info = 2; - - // Debug information of the schema store. - optional SchemaDebugInfoProto schema_info = 3; -} - -// Next tag: 3 -message DebugInfoResultProto { - // Status code can be one of: - // OK - // FAILED_PRECONDITION - // - // See status.proto for more details. - optional StatusProto status = 1; - - // Debug information for Icing. 
- optional DebugInfoProto debug_info = 2; -} diff --git a/proto/icing/proto/document.proto b/proto/icing/proto/document.proto index 1a501e7..9a4e5b9 100644 --- a/proto/icing/proto/document.proto +++ b/proto/icing/proto/document.proto @@ -209,7 +209,7 @@ message DeleteBySchemaTypeResultProto { } // Result of a call to IcingSearchEngine.DeleteByQuery -// Next tag: 5 +// Next tag: 3 message DeleteByQueryResultProto { // Status code can be one of: // OK @@ -224,20 +224,5 @@ message DeleteByQueryResultProto { optional StatusProto status = 1; // Stats for delete execution performance. - optional DeleteByQueryStatsProto delete_by_query_stats = 3; - - // Used by DeleteByQueryResultProto to return information about deleted - // documents. - message DocumentGroupInfo { - optional string namespace = 1; - optional string schema = 2; - repeated string uris = 3; - } - - // Additional return message that shows the uris of the deleted documents, if - // users set return_deleted_document_info to true. - // The result is grouped by the corresponding namespace and type. - repeated DocumentGroupInfo deleted_documents = 4; - - reserved 2; + optional DeleteStatsProto delete_stats = 2; } diff --git a/proto/icing/proto/initialize.proto b/proto/icing/proto/initialize.proto index 7fe1e6f..ab2556d 100644 --- a/proto/icing/proto/initialize.proto +++ b/proto/icing/proto/initialize.proto @@ -30,6 +30,19 @@ message IcingSearchEngineOptions { // the index saved by the last instance. optional string base_dir = 1; + // The maximum number of tokens to be allowed per document. If a document + // exceeds this number of tokens, then only the first max_tokens_per_doc + // will be indexed. + // + // Clients may use this value to prevent the possibility of a select few + // documents from exhausting limits in the index that are shared between all + // documents (ie max allowed index size). + // + // Valid values: [1, INT_MAX], Current default is 1/5 of the default of + // max_document_size. + // Optional. + optional int32 max_tokens_per_doc = 2 [default = 13107]; + // The maximum allowable token length. All tokens in excess of this size // will be truncated to max_token_length before being indexed. // @@ -57,8 +70,6 @@ message IcingSearchEngineOptions { // Valid values: [1, INT_MAX] // Optional. optional int32 index_merge_size = 4 [default = 1048576]; // 1 MiB - - reserved 2; } // Result of a call to IcingSearchEngine.Initialize diff --git a/proto/icing/proto/logging.proto b/proto/icing/proto/logging.proto index 0a7c4a6..29f7f80 100644 --- a/proto/icing/proto/logging.proto +++ b/proto/icing/proto/logging.proto @@ -23,7 +23,7 @@ option java_multiple_files = true; option objc_class_prefix = "ICNG"; // Stats of the top-level function IcingSearchEngine::Initialize(). -// Next tag: 12 +// Next tag: 11 message InitializeStatsProto { // Overall time used for the function call. optional int32 latency_ms = 1; @@ -46,9 +46,6 @@ message InitializeStatsProto { // Random I/O errors. IO_ERROR = 4; - - // The document log is using legacy format. - LEGACY_DOCUMENT_LOG_FORMAT = 5; } // Possible recovery causes for document store: @@ -95,10 +92,6 @@ message InitializeStatsProto { // Number of schema types currently in schema store. optional int32 num_schema_types = 10; - - // Number of consecutive initialization failures that immediately preceded - // this initialization. - optional int32 num_previous_init_failures = 11; } // Stats of the top-level function IcingSearchEngine::Put(). 
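Tying the initialize.proto hunk above to the tokenization stats in the next hunk: max_tokens_per_doc caps how much of a single document gets indexed, and exceeded_max_token_num (below) reports when that cap was hit during a Put. An illustrative client configuration using only the fields shown above (standard protobuf C++ setters; the directory is a placeholder):

    #include "icing/proto/initialize.pb.h"

    icing::lib::IcingSearchEngineOptions MakeOptions() {
      icing::lib::IcingSearchEngineOptions options;
      options.set_base_dir("/placeholder/icing/dir");  // placeholder path
      options.set_max_tokens_per_doc(13107);  // the documented default
      options.set_index_merge_size(1048576);  // 1 MiB, per the field's default
      return options;
    }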
@@ -121,10 +114,12 @@ message PutDocumentStatsProto { optional int32 document_size = 5; message TokenizationStats { + // Whether the number of tokens to be indexed exceeded the max number of + // tokens per document. + optional bool exceeded_max_token_num = 2; + // Number of tokens added to the index. optional int32 num_tokens_indexed = 1; - - reserved 2; } optional TokenizationStats tokenization_stats = 6; } @@ -186,7 +181,8 @@ message QueryStatsProto { } // Stats of the top-level functions IcingSearchEngine::Delete, -// IcingSearchEngine::DeleteByNamespace, IcingSearchEngine::DeleteBySchemaType. +// IcingSearchEngine::DeleteByNamespace, IcingSearchEngine::DeleteBySchemaType, +// IcingSearchEngine::DeleteByQuery. // Next tag: 4 message DeleteStatsProto { // Overall time used for the function call. @@ -200,10 +196,8 @@ message DeleteStatsProto { // Delete one document. SINGLE = 1; - // Delete by query. This value is deprecated. - // IcingSearchEngine::DeleteByQuery will return a DeleteByQueryStatsProto - // rather than a DeleteStatsProto. - DEPRECATED_QUERY = 2 [deprecated = true]; + // Delete by query. + QUERY = 2; // Delete by namespace. NAMESPACE = 3; @@ -217,32 +211,3 @@ message DeleteStatsProto { // Number of documents deleted by this call. optional int32 num_documents_deleted = 3; } - -// Stats of the top-level functions IcingSearchEngine::DeleteByQuery. -// Next tag: 9 -message DeleteByQueryStatsProto { - // Overall time used for the function call. - optional int32 latency_ms = 1; - - // Number of documents deleted by this call. - optional int32 num_documents_deleted = 2; - - // The UTF-8 length of the query string - optional int32 query_length = 3; - - // Number of terms in the query string. - optional int32 num_terms = 4; - - // Number of namespaces filtered. - optional int32 num_namespaces_filtered = 5; - - // Number of schema types filtered. - optional int32 num_schema_types_filtered = 6; - - // Time used to parse the query, including 2 parts: tokenizing and - // transforming tokens into an iterator tree. - optional int32 parse_query_latency_ms = 7; - - // Time used to delete each document. - optional int32 document_removal_latency_ms = 8; -} diff --git a/proto/icing/proto/schema.proto b/proto/icing/proto/schema.proto index ffb6f2c..4188a8c 100644 --- a/proto/icing/proto/schema.proto +++ b/proto/icing/proto/schema.proto @@ -91,14 +91,6 @@ message StringIndexingConfig { // Tokenization for plain text. PLAIN = 1; - - // Tokenizes text in verbatim. This means no normalization or segmentation - // is applied to string values that are tokenized using this type. - // Therefore, the output token is equivalent to the raw string text. For - // example, "Hello, world!" would be tokenized as "Hello, world!" - // preserving punctuation and capitalization, and not creating separate - // tokens between the space. - VERBATIM = 2; } } optional TokenizerType.Code tokenizer_type = 2; @@ -205,7 +197,7 @@ message SchemaProto { } // Result of a call to IcingSearchEngine.SetSchema -// Next tag: 8 +// Next tag: 4 message SetSchemaResultProto { // Status code can be one of: // OK @@ -229,21 +221,6 @@ message SetSchemaResultProto { // documents that fail validation against the new schema types would also be // deleted. repeated string incompatible_schema_types = 3; - - // Schema types that did not exist in the previous schema and were added with - // the new schema type. 
diff --git a/proto/icing/proto/schema.proto b/proto/icing/proto/schema.proto
index ffb6f2c..4188a8c 100644
--- a/proto/icing/proto/schema.proto
+++ b/proto/icing/proto/schema.proto
@@ -91,14 +91,6 @@ message StringIndexingConfig {
 
       // Tokenization for plain text.
       PLAIN = 1;
-
-      // Tokenizes text in verbatim. This means no normalization or segmentation
-      // is applied to string values that are tokenized using this type.
-      // Therefore, the output token is equivalent to the raw string text. For
-      // example, "Hello, world!" would be tokenized as "Hello, world!"
-      // preserving punctuation and capitalization, and not creating separate
-      // tokens between the space.
-      VERBATIM = 2;
     }
   }
   optional TokenizerType.Code tokenizer_type = 2;
@@ -205,7 +197,7 @@ message SchemaProto {
 }
 
 // Result of a call to IcingSearchEngine.SetSchema
-// Next tag: 8
+// Next tag: 4
 message SetSchemaResultProto {
   // Status code can be one of:
   //   OK
@@ -229,21 +221,6 @@ message SetSchemaResultProto {
   // documents that fail validation against the new schema types would also be
   // deleted.
   repeated string incompatible_schema_types = 3;
-
-  // Schema types that did not exist in the previous schema and were added with
-  // the new schema type.
-  repeated string new_schema_types = 4;
-
-  // Schema types that were changed in a way that was backwards compatible and
-  // didn't invalidate the index.
-  repeated string fully_compatible_changed_schema_types = 5;
-
-  // Schema types that were changed in a way that was backwards compatible, but
-  // invalidated the index.
-  repeated string index_incompatible_changed_schema_types = 6;
-
-  // Overall time used for the function call.
-  optional int32 latency_ms = 7;
 }
 
 // Result of a call to IcingSearchEngine.GetSchema
diff --git a/proto/icing/proto/scoring.proto b/proto/icing/proto/scoring.proto
index 71c943e..6186fde 100644
--- a/proto/icing/proto/scoring.proto
+++ b/proto/icing/proto/scoring.proto
@@ -23,7 +23,7 @@ option objc_class_prefix = "ICNG";
 // Encapsulates the configurations on how Icing should score and rank the search
 // results.
 // TODO(b/170347684): Change all timestamps to seconds.
-// Next tag: 4
+// Next tag: 3
 message ScoringSpecProto {
   // OPTIONAL: Indicates how the search results will be ranked.
   message RankingStrategy {
@@ -83,42 +83,4 @@ message ScoringSpecProto {
     }
   }
   optional Order.Code order_by = 2;
-
-  // OPTIONAL: Specifies property weights for RELEVANCE_SCORE scoring strategy.
-  // Property weights are used for promoting or demoting query term matches in a
-  // document property. When property weights are provided, the term frequency
-  // is multiplied by the normalized property weight when computing the
-  // normalized term frequency component of BM25F. To prefer query term matches
-  // in the "subject" property over the "body" property of "Email" documents,
-  // set a higher property weight value for "subject" than "body". By default,
-  // all properties that are not specified are given a raw, pre-normalized
-  // weight of 1.0 when scoring.
-  repeated TypePropertyWeights type_property_weights = 3;
-}
-
-// Next tag: 3
-message TypePropertyWeights {
-  // Schema type to apply property weights to.
-  optional string schema_type = 1;
-
-  // Property weights to apply to the schema type.
-  repeated PropertyWeight property_weights = 2;
-}
-
-// Next tag: 3
-message PropertyWeight {
-  // Property path to assign property weight to. Property paths must be composed
-  // only of property names and property separators (the '.' character).
-  // For example, if an "Email" schema type has string property "subject" and
-  // document property "sender", which has string property "name", the property
-  // path for the email's subject would just be "subject" and the property path
-  // for the sender's name would be "sender.name". If an invalid path is
-  // specified, the property weight is discarded.
-  optional string path = 1;
-
-  // Property weight, valid values are positive and zero. Setting a zero
-  // property weight will remove scoring contribution for a query term match in
-  // the property. Negative weights are invalid and will result in an error.
-  // By default, a property is given a raw, pre-normalized weight of 1.0.
-  optional double weight = 2;
 }
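For readers tracking what the scoring.proto removal gives up: TypePropertyWeights let clients skew the BM25F term-frequency component per property, as the deleted comments describe. The toy model below illustrates that scaling; normalizing by the maximum weight in the schema type is an assumption made here for illustration, not a claim about Icing's internal normalization:

    #include <algorithm>
    #include <vector>

    // One query-term match set for a single document: the term's frequency in
    // each property it appears in, plus the client-supplied raw weight of that
    // property (PropertyWeight.weight; unspecified properties default to 1.0).
    struct PropertyHit {
      double term_frequency;
      double raw_weight;
    };

    // Weighted tf feeding BM25F: each property's tf is scaled by its
    // normalized weight (assumed here to be raw_weight / max raw_weight).
    double WeightedTermFrequency(const std::vector<PropertyHit>& hits) {
      double max_weight = 0.0;
      for (const PropertyHit& hit : hits) {
        max_weight = std::max(max_weight, hit.raw_weight);
      }
      if (max_weight <= 0.0) return 0.0;

      double weighted_tf = 0.0;
      for (const PropertyHit& hit : hits) {
        weighted_tf += hit.term_frequency * (hit.raw_weight / max_weight);
      }
      return weighted_tf;
    }

With "subject" weighted 2.0 and "body" weighted 1.0, one subject match contributes as much to the tf sum as two body matches, which is exactly the promote/demote effect the removed comments describe.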
diff --git a/proto/icing/proto/search.proto b/proto/icing/proto/search.proto
index f005c76..66fdbe6 100644
--- a/proto/icing/proto/search.proto
+++ b/proto/icing/proto/search.proto
@@ -85,16 +85,16 @@ message ResultSpecProto {
     // have snippet information provided. If set to 0, snippeting is disabled.
     optional int32 num_matches_per_property = 2;
 
-    // How large of a window to provide. Windows start at
-    // max_window_utf32_length / 2 bytes before the middle of the matching token
-    // and end at max_window_utf32_length / 2 bytes after the middle of the
-    // matching token. Windowing respects token boundaries. Therefore, the
-    // returned window may be smaller than requested. Setting
-    // max_window_utf32_length to 0 will disable windowing information. If
-    // matches enabled is also set to false, then snippeting is disabled. Ex.
-    // max_window_utf32_length = 16. "foo bar baz bat rat" with a query of "baz"
+    // How large of a window to provide. Windows start at max_window_bytes / 2
+    // bytes before the middle of the matching token and end at max_window_bytes
+    // / 2 bytes after the middle of the matching token. Windowing respects
+    // token boundaries.
+    // Therefore, the returned window may be smaller than requested. Setting
+    // max_window_bytes to 0 will disable windowing information. If matches
+    // enabled is also set to false, then snippeting is disabled.
+    // Ex. max_window_bytes = 16. "foo bar baz bat rat" with a query of "baz"
     // will return a window of "bar baz bat" which is only 11 bytes long.
-    optional int32 max_window_utf32_length = 3;
+    optional int32 max_window_bytes = 3;
   }
 
   optional SnippetSpecProto snippet_spec = 3;
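The windowing rule in the comment above, worked through: "baz" occupies bytes [8, 10] of "foo bar baz bat rat", so its middle is byte 9; with max_window_bytes = 16 the ideal window is bytes [1, 17], which then snaps inward to token boundaries, yielding "bar baz bat" (bytes [4, 14], 11 bytes). A toy sketch of that arithmetic follows; splitting on spaces is a deliberate simplification of Icing's language-aware segmenter:

    #include <algorithm>
    #include <iostream>
    #include <string>

    // Center a window of at most max_window_bytes on the middle of the match,
    // then shrink it so no token is cut in half. Tokens here are just
    // space-separated words.
    std::string SnippetWindow(const std::string& text, int match_pos,
                              int match_len, int max_window_bytes) {
      const int size = static_cast<int>(text.size());
      const int middle = match_pos + match_len / 2;
      int begin = std::max(0, middle - max_window_bytes / 2);
      int end = std::min(size, middle + max_window_bytes / 2);
      // If begin falls inside a token, advance to the next token start.
      while (begin > 0 && begin < size && text[begin - 1] != ' ') ++begin;
      // If end falls inside a token, retreat to the previous token end.
      while (end > begin && end < size && text[end] != ' ') --end;
      return text.substr(begin, end - begin);
    }

    int main() {
      // Prints "bar baz bat": the requested 16-byte window shrinks to 11
      // bytes because it may not cut "foo" or "rat" in half.
      std::cout << SnippetWindow("foo bar baz bat rat", 8, 3, 16) << "\n";
    }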
@@ -136,57 +136,27 @@
 }
 
 // The representation of a single match within a DocumentProto property.
-//
-// Example : A document whose content is "Necesito comprar comida mañana." and a
-// query for "mana" with window=15
-// Next tag: 12
+// Next tag: 10
 message SnippetMatchProto {
   // The index of the byte in the string at which the match begins and the
   // length in bytes of the match.
-  //
-  // For the example above, the values of these fields would be
-  // exact_match_byte_position=24, exact_match_byte_length=7 "mañana"
   optional int32 exact_match_byte_position = 2;
   optional int32 exact_match_byte_length = 3;
 
-  // The length in bytes of the subterm that matches the query. The beginning of
-  // the submatch is the same as exact_match_byte_position.
-  //
-  // For the example above, the value of this field would be 5. With
-  // exact_match_byte_position=24 above, it would produce the substring "maña"
-  optional int32 submatch_byte_length = 10;
-
   // The index of the UTF-16 code unit in the string at which the match begins
   // and the length in UTF-16 code units of the match. This is for use with
   // UTF-16 encoded strings like Java.lang.String.
-  //
-  // For the example above, the values of these fields would be
-  // exact_match_utf16_position=24, exact_match_utf16_length=6 "mañana"
   optional int32 exact_match_utf16_position = 6;
   optional int32 exact_match_utf16_length = 7;
 
-  // The length in UTF-16 code units of the subterm that matches the query. The
-  // beginning of the submatch is the same as exact_match_utf16_position. This
-  // is for use with UTF-16 encoded strings like Java.lang.String.
-  //
-  // For the example above, the value of this field would be 4. With
-  // exact_match_utf16_position=24 above, it would produce the substring "maña"
-  optional int32 submatch_utf16_length = 11;
-
   // The index of the byte in the string at which the suggested snippet window
   // begins and the length in bytes of the window.
-  //
-  // For the example above, the values of these fields would be
-  // window_byte_position=17, window_byte_length=15 "comida mañana."
   optional int32 window_byte_position = 4;
   optional int32 window_byte_length = 5;
 
   // The index of the UTF-16 code unit in the string at which the suggested
   // snippet window begins and the length in UTF-16 code units of the window.
   // This is for use with UTF-16 encoded strings like Java.lang.String.
-  //
-  // For the example above, the values of these fields would be
-  // window_utf16_position=17, window_utf16_length=14 "comida mañana."
   optional int32 window_utf16_position = 8;
   optional int32 window_utf16_length = 9;
 
@@ -308,54 +278,3 @@
   // type will be retrieved.
   repeated TypePropertyMask type_property_masks = 1;
 }
-
-// Next tag: 5
-message SuggestionSpecProto {
-  // REQUIRED: The "raw" prefix string that users may type. For example, "f"
-  // will search for suggested query that start with "f" like "foo", "fool".
-  optional string prefix = 1;
-
-  // OPTIONAL: Only search for suggestions that under the specified namespaces.
-  // If unset, the suggestion will search over all namespaces. Note that this
-  // applies to the entire 'prefix'. To issue different suggestions for
-  // different namespaces, separate RunSuggestion()'s will need to be made.
-  repeated string namespace_filters = 2;
-
-  // REQUIRED: The number of suggestions to be returned.
-  optional int32 num_to_return = 3;
-
-  // Indicates how the suggestion terms should be scored and ranked.
-  message SuggestionScoringSpecProto {
-    // TermMatchType.Code=UNKNOWN
-    // Should never purposely be set and may lead to undefined behavior. This is
-    // used for backwards compatibility reasons.
-    //
-    // TermMatchType.Code=EXACT_ONLY
-    // Only exact hits will be counted to score a suggestion term.
-    //
-    // TermMatchType.Code=PREFIX
-    // Both exact hits and prefix hits will be counted to score a suggestion
-    // term.
-    optional TermMatchType.Code scoring_match_type = 1;
-  }
-
-  optional SuggestionScoringSpecProto scoring_spec = 4;
-}
-
-// Next tag: 3
-message SuggestionResponse {
-  message Suggestion {
-    // The suggested query string for client to search for.
-    optional string query = 1;
-  }
-
-  // Status code can be one of:
-  //   OK
-  //   FAILED_PRECONDITION
-  //   INTERNAL
-  //
-  // See status.proto for more details.
-  optional StatusProto status = 1;
-
-  repeated Suggestion suggestions = 2;
-}
diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt
index 73d349b..35ad6d9 100644
--- a/synced_AOSP_CL_number.txt
+++ b/synced_AOSP_CL_number.txt
@@ -1 +1 @@
-set(synced_AOSP_CL_number=436284873)
+set(synced_AOSP_CL_number=378695940)
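On the byte/UTF-16 duplication that survives in SnippetMatchProto above: Icing stores text as UTF-8, while java.lang.String indexes UTF-16 code units, so the same match generally has different extents in the two encodings. In the removed example, "mañana" is 7 UTF-8 bytes but 6 UTF-16 code units, hence exact_match_byte_length=7 alongside exact_match_utf16_length=6. A sketch of the length conversion, assuming well-formed UTF-8 input:

    #include <string>

    // Count the UTF-16 code units needed for a UTF-8 string. The lead byte
    // determines each sequence's length; code points outside the BMP (4-byte
    // UTF-8) need a surrogate pair, i.e. 2 UTF-16 units.
    int Utf16Length(const std::string& utf8) {
      int units = 0;
      for (size_t i = 0; i < utf8.size();) {
        const unsigned char c = utf8[i];
        if (c < 0x80)      { i += 1; units += 1; }  // ASCII
        else if (c < 0xE0) { i += 2; units += 1; }  // 2-byte sequence, e.g. "ñ"
        else if (c < 0xF0) { i += 3; units += 1; }  // 3-byte sequence
        else               { i += 4; units += 2; }  // 4-byte: surrogate pair
      }
      return units;
    }

    // Utf16Length("ma\xc3\xb1ana") == 6, while the UTF-8 byte length is 7.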