diff options
author | Terry Wang <tytytyww@google.com> | 2023-03-03 10:43:50 -0800 |
---|---|---|
committer | Terry Wang <tytytyww@google.com> | 2023-03-03 10:45:16 -0800 |
commit | 49064c458678781fbf3db256751658728dc87740 (patch) | |
tree | 76f82b7a029bb630ee916e1eb9cb14afcdc8f84d | |
parent | e103b8ea56212b2a5abc082ce888843f19c7d567 (diff) | |
download | icing-49064c458678781fbf3db256751658728dc87740.tar.gz |
Update Icing from upstream.
Descriptions:
======================================================================
Modify the lexer to consider internal minus characters to be a part of TEXT segments (as hyphens).
======================================================================
Modify the query language to support an individual prefix operator '*'.
======================================================================
Fix the fingerprint issue for the document key mapper in the document store
======================================================================
Resolve issues around the handling of negative numbers.
======================================================================
Fix doc join info compiler warning/error
======================================================================
Codify how colons are tokenized by the lexer
Bug: 208654892
Bug: 268680462
Change-Id: I67169b2e656eeafb6500b5b03a6c46c3cc9c3d66
19 files changed, 888 insertions, 272 deletions
diff --git a/icing/icing-search-engine_schema_test.cc b/icing/icing-search-engine_schema_test.cc index 59d25e5..38a0464 100644 --- a/icing/icing-search-engine_schema_test.cc +++ b/icing/icing-search-engine_schema_test.cc @@ -1987,6 +1987,8 @@ TEST_F(IcingSearchEngineSchemaTest, IcingShouldWorkFor64Sections) { ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); ASSERT_THAT(icing.Put(email_collection).status(), ProtoIsOk()); + SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::PREFIX); const std::vector<std::string> query_terms = { "first1", "last2", "email3@gmail.com", "000-000-001", "body", "subject", "2022-08-02", "3\\:00"}; @@ -1995,8 +1997,6 @@ TEST_F(IcingSearchEngineSchemaTest, IcingShouldWorkFor64Sections) { *expected_document.mutable_results()->Add()->mutable_document() = email_collection; for (const std::string& query_term : query_terms) { - SearchSpecProto search_spec; - search_spec.set_term_match_type(TermMatchType::PREFIX); search_spec.set_query(query_term); SearchResultProto actual_results = icing.Search(search_spec, GetDefaultScoringSpec(), @@ -2005,8 +2005,6 @@ TEST_F(IcingSearchEngineSchemaTest, IcingShouldWorkFor64Sections) { EqualsSearchResultIgnoreStatsAndScores(expected_document)); } - SearchSpecProto search_spec; - search_spec.set_term_match_type(TermMatchType::PREFIX); search_spec.set_query("foo"); SearchResultProto expected_no_documents; expected_no_documents.mutable_status()->set_code(StatusProto::OK); diff --git a/icing/icing-search-engine_search_test.cc b/icing/icing-search-engine_search_test.cc index 3c32253..5648184 100644 --- a/icing/icing-search-engine_search_test.cc +++ b/icing/icing-search-engine_search_test.cc @@ -3002,12 +3002,6 @@ TEST_P(IcingSearchEngineSearchTest, SnippetSectionRestrict) { } TEST_P(IcingSearchEngineSearchTest, Hyphens) { - // TODO(b/208654892): Fix issues with minus/hyphen chars. 
- if (GetParam() == - SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY) { - GTEST_SKIP() - << "Advanced query doesn't properly support hyphens at this time."; - } IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); diff --git a/icing/join/doc-join-info.cc b/icing/join/doc-join-info.cc index 9bef08a..3b06f01 100644 --- a/icing/join/doc-join-info.cc +++ b/icing/join/doc-join-info.cc @@ -25,13 +25,14 @@ namespace lib { DocJoinInfo::DocJoinInfo(DocumentId document_id, JoinablePropertyId joinable_property_id) { - value_ = 0; + Value temp_value = 0; bit_util::BitfieldSet(/*new_value=*/document_id, /*lsb_offset=*/kJoinablePropertyIdBits, - /*len=*/kDocumentIdBits, &value_); + /*len=*/kDocumentIdBits, &temp_value); bit_util::BitfieldSet(/*new_value=*/joinable_property_id, /*lsb_offset=*/0, - /*len=*/kJoinablePropertyIdBits, &value_); + /*len=*/kJoinablePropertyIdBits, &temp_value); + value_ = temp_value; } DocumentId DocJoinInfo::document_id() const { diff --git a/icing/query/advanced_query_parser/abstract-syntax-tree.h b/icing/query/advanced_query_parser/abstract-syntax-tree.h index dc28ab6..d18f6ea 100644 --- a/icing/query/advanced_query_parser/abstract-syntax-tree.h +++ b/icing/query/advanced_query_parser/abstract-syntax-tree.h @@ -52,18 +52,24 @@ class Node { class TerminalNode : public Node { public: - explicit TerminalNode(std::string value) : value_(std::move(value)) {} + explicit TerminalNode(std::string value, bool is_prefix) + : value_(std::move(value)), is_prefix_(is_prefix) {} - const std::string& value() const { return value_; } + const std::string& value() const& { return value_; } + std::string value() && { return std::move(value_); } + + bool is_prefix() const { return is_prefix_; } private: std::string value_; + bool is_prefix_; }; class FunctionNameNode : public TerminalNode { public: explicit FunctionNameNode(std::string value) - : TerminalNode(std::move(value)) {} + : 
TerminalNode(std::move(value), /*is_prefix=*/false) {} + void Accept(AbstractSyntaxTreeVisitor* visitor) const override { visitor->VisitFunctionName(this); } @@ -71,7 +77,9 @@ class FunctionNameNode : public TerminalNode { class StringNode : public TerminalNode { public: - explicit StringNode(std::string value) : TerminalNode(std::move(value)) {} + explicit StringNode(std::string value, bool is_prefix = false) + : TerminalNode(std::move(value), is_prefix) {} + void Accept(AbstractSyntaxTreeVisitor* visitor) const override { visitor->VisitString(this); } @@ -79,7 +87,9 @@ class StringNode : public TerminalNode { class TextNode : public TerminalNode { public: - explicit TextNode(std::string value) : TerminalNode(std::move(value)) {} + explicit TextNode(std::string value, bool is_prefix = false) + : TerminalNode(std::move(value), is_prefix) {} + void Accept(AbstractSyntaxTreeVisitor* visitor) const override { visitor->VisitText(this); } diff --git a/icing/query/advanced_query_parser/function_test.cc b/icing/query/advanced_query_parser/function_test.cc index f9aaed6..3b3ca40 100644 --- a/icing/query/advanced_query_parser/function_test.cc +++ b/icing/query/advanced_query_parser/function_test.cc @@ -65,7 +65,8 @@ TEST(FunctionTest, ParamNotWrongTypeFails) { /*params=*/{Param(DataType::kString)}, TrivialEval())); // foo(bar) std::vector<PendingValue> args; - args.push_back(PendingValue::CreateTextPendingValue("bar")); + args.push_back(PendingValue::CreateTextPendingValue( + QueryTerm{"bar", /*is_prefix_val=*/false})); EXPECT_THAT(function.Eval(std::move(args)), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } @@ -78,7 +79,8 @@ TEST(FunctionTest, ParamRequiredArgSucceeds) { // foo("bar") std::vector<PendingValue> args; - args.push_back(PendingValue::CreateStringPendingValue("bar")); + args.push_back(PendingValue::CreateStringPendingValue( + QueryTerm{"bar", /*is_prefix_val=*/false})); ICING_ASSERT_OK_AND_ASSIGN(PendingValue val, 
function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); } @@ -136,14 +138,17 @@ TEST(FunctionTest, MultipleArgsTrailingOptionalSucceeds) { // foo("bar") std::vector<PendingValue> args; - args.push_back(PendingValue::CreateStringPendingValue("bar")); + args.push_back(PendingValue::CreateStringPendingValue( + QueryTerm{"bar", /*is_prefix_val=*/false})); ICING_ASSERT_OK_AND_ASSIGN(PendingValue val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); // foo("bar", "baz") args = std::vector<PendingValue>(); - args.push_back(PendingValue::CreateStringPendingValue("bar")); - args.push_back(PendingValue::CreateStringPendingValue("baz")); + args.push_back(PendingValue::CreateStringPendingValue( + QueryTerm{"bar", /*is_prefix_val=*/false})); + args.push_back(PendingValue::CreateStringPendingValue( + QueryTerm{"baz", /*is_prefix_val=*/false})); ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); } @@ -159,22 +164,28 @@ TEST(FunctionTest, MultipleArgsTrailingVariableSucceeds) { // foo("bar") std::vector<PendingValue> args; - args.push_back(PendingValue::CreateStringPendingValue("bar")); + args.push_back(PendingValue::CreateStringPendingValue( + QueryTerm{"bar", /*is_prefix_val=*/false})); ICING_ASSERT_OK_AND_ASSIGN(PendingValue val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); // foo("bar", "baz") args = std::vector<PendingValue>(); - args.push_back(PendingValue::CreateStringPendingValue("bar")); - args.push_back(PendingValue::CreateStringPendingValue("baz")); + args.push_back(PendingValue::CreateStringPendingValue( + QueryTerm{"bar", /*is_prefix_val=*/false})); + args.push_back(PendingValue::CreateStringPendingValue( + QueryTerm{"baz", /*is_prefix_val=*/false})); ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); // foo("bar", "baz", "bat") args = std::vector<PendingValue>(); - 
args.push_back(PendingValue::CreateStringPendingValue("bar")); - args.push_back(PendingValue::CreateStringPendingValue("baz")); - args.push_back(PendingValue::CreateStringPendingValue("bat")); + args.push_back(PendingValue::CreateStringPendingValue( + QueryTerm{"bar", /*is_prefix_val=*/false})); + args.push_back(PendingValue::CreateStringPendingValue( + QueryTerm{"baz", /*is_prefix_val=*/false})); + args.push_back(PendingValue::CreateStringPendingValue( + QueryTerm{"bat", /*is_prefix_val=*/false})); ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); } @@ -205,20 +216,24 @@ TEST(FunctionTest, MultipleArgsOptionalBeforeOptionalSucceeds) { // foo("bar") args = std::vector<PendingValue>(); - args.push_back(PendingValue::CreateStringPendingValue("bar")); + args.push_back(PendingValue::CreateStringPendingValue( + QueryTerm{"bar", /*is_prefix_val=*/false})); ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); // foo("bar", baz) args = std::vector<PendingValue>(); - args.push_back(PendingValue::CreateStringPendingValue("bar")); - args.push_back(PendingValue::CreateTextPendingValue("baz")); + args.push_back(PendingValue::CreateStringPendingValue( + QueryTerm{"bar", /*is_prefix_val=*/false})); + args.push_back(PendingValue::CreateTextPendingValue( + QueryTerm{"baz", /*is_prefix_val=*/false})); ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); // foo(baz) args = std::vector<PendingValue>(); - args.push_back(PendingValue::CreateTextPendingValue("baz")); + args.push_back(PendingValue::CreateTextPendingValue( + QueryTerm{"baz", /*is_prefix_val=*/false})); EXPECT_THAT(function.Eval(std::move(args)), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } @@ -239,35 +254,44 @@ TEST(FunctionTest, MultipleArgsOptionalBeforeVariableSucceeds) { // foo("bar") args = std::vector<PendingValue>(); - 
args.push_back(PendingValue::CreateStringPendingValue("bar")); + args.push_back(PendingValue::CreateStringPendingValue( + QueryTerm{"bar", /*is_prefix_val=*/false})); ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); // foo("bar", baz) args = std::vector<PendingValue>(); - args.push_back(PendingValue::CreateStringPendingValue("bar")); - args.push_back(PendingValue::CreateTextPendingValue("baz")); + args.push_back(PendingValue::CreateStringPendingValue( + QueryTerm{"bar", /*is_prefix_val=*/false})); + args.push_back(PendingValue::CreateTextPendingValue( + QueryTerm{"baz", /*is_prefix_val=*/false})); ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); // foo("bar", baz, bat) args = std::vector<PendingValue>(); - args.push_back(PendingValue::CreateStringPendingValue("bar")); - args.push_back(PendingValue::CreateTextPendingValue("baz")); - args.push_back(PendingValue::CreateTextPendingValue("bat")); + args.push_back(PendingValue::CreateStringPendingValue( + QueryTerm{"bar", /*is_prefix_val=*/false})); + args.push_back(PendingValue::CreateTextPendingValue( + QueryTerm{"baz", /*is_prefix_val=*/false})); + args.push_back(PendingValue::CreateTextPendingValue( + QueryTerm{"bat", /*is_prefix_val=*/false})); ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); // foo(baz) args = std::vector<PendingValue>(); - args.push_back(PendingValue::CreateTextPendingValue("baz")); + args.push_back(PendingValue::CreateTextPendingValue( + QueryTerm{"baz", /*is_prefix_val=*/false})); EXPECT_THAT(function.Eval(std::move(args)), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); // foo(baz, bat) args = std::vector<PendingValue>(); - args.push_back(PendingValue::CreateTextPendingValue("baz")); - args.push_back(PendingValue::CreateTextPendingValue("bat")); + args.push_back(PendingValue::CreateTextPendingValue( + 
QueryTerm{"baz", /*is_prefix_val=*/false})); + args.push_back(PendingValue::CreateTextPendingValue( + QueryTerm{"bat", /*is_prefix_val=*/false})); EXPECT_THAT(function.Eval(std::move(args)), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } diff --git a/icing/query/advanced_query_parser/lexer.cc b/icing/query/advanced_query_parser/lexer.cc index 685f2e9..6cddd96 100644 --- a/icing/query/advanced_query_parser/lexer.cc +++ b/icing/query/advanced_query_parser/lexer.cc @@ -38,10 +38,24 @@ bool Lexer::ConsumeWhitespace() { } bool Lexer::ConsumeQuerySingleChar() { - if (current_char_ != ':') { - return false; + switch (current_char_) { + case ':': + tokens_.push_back({":", TokenType::COMPARATOR}); + break; + case '*': + tokens_.push_back({"", TokenType::STAR}); + break; + case '-': + if (in_text_) { + // MINUS ('-') is considered to be a part of a text segment if it is + // in the middle of a TEXT segment (ex. `foo-bar`). + return false; + } + tokens_.push_back({"", TokenType::MINUS}); + break; + default: + return false; } - tokens_.push_back({":", TokenType::COMPARATOR}); Advance(); return true; } @@ -57,6 +71,9 @@ bool Lexer::ConsumeScoringSingleChar() { case '/': tokens_.push_back({"", TokenType::DIV}); break; + case '-': + tokens_.push_back({"", TokenType::MINUS}); + break; default: return false; } @@ -72,9 +89,6 @@ bool Lexer::ConsumeGeneralSingleChar() { case '.': tokens_.push_back({"", TokenType::DOT}); break; - case '-': - tokens_.push_back({"", TokenType::MINUS}); - break; case '(': tokens_.push_back({"", TokenType::LPAREN}); break; @@ -176,6 +190,7 @@ bool Lexer::Text() { tokens_.push_back({"", TokenType::TEXT}); int token_index = tokens_.size() - 1; while (!ConsumeNonText() && current_char_ != '\0') { + in_text_ = true; // When getting a backslash in TEXT, unescape it by accepting its following // character no matter which character it is, including white spaces, // operator symbols, parentheses, etc. 
@@ -194,6 +209,8 @@ bool Lexer::Text() { // No need to break, since NonText() must be true at this point. } } + in_text_ = false; + if (language_ == Lexer::Language::QUERY) { std::string &text = tokens_[token_index].text; TokenType &type = tokens_[token_index].type; diff --git a/icing/query/advanced_query_parser/lexer.h b/icing/query/advanced_query_parser/lexer.h index a13ab60..f7f06dc 100644 --- a/icing/query/advanced_query_parser/lexer.h +++ b/icing/query/advanced_query_parser/lexer.h @@ -38,6 +38,7 @@ class Lexer { DOT, // '.' PLUS, // '+' Not allowed in QUERY language. MINUS, // '-' + STAR, // '*' Not allowed in SCORING language. TIMES, // '*' Not allowed in QUERY language. DIV, // '/' Not allowed in QUERY language. LPAREN, // '(' @@ -149,6 +150,10 @@ class Lexer { int32_t current_index_ = -1; char current_char_ = '\0'; std::vector<LexerToken> tokens_; + + // Stores whether the lexer is currently inspecting a TEXT segment while + // handling current_char_. + bool in_text_ = false; }; } // namespace lib diff --git a/icing/query/advanced_query_parser/lexer_test.cc b/icing/query/advanced_query_parser/lexer_test.cc index 86284fb..c6d215c 100644 --- a/icing/query/advanced_query_parser/lexer_test.cc +++ b/icing/query/advanced_query_parser/lexer_test.cc @@ -73,22 +73,26 @@ TEST(LexerTest, PrefixQuery) { ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> tokens, lexer->ExtractTokens()); EXPECT_THAT(tokens, - ElementsAre(EqualsLexerToken("foo*", Lexer::TokenType::TEXT))); + ElementsAre(EqualsLexerToken("foo", Lexer::TokenType::TEXT), + EqualsLexerToken("", Lexer::TokenType::STAR))); lexer = std::make_unique<Lexer>("fooAND*", Lexer::Language::QUERY); ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens()); EXPECT_THAT(tokens, - ElementsAre(EqualsLexerToken("fooAND*", Lexer::TokenType::TEXT))); + ElementsAre(EqualsLexerToken("fooAND", Lexer::TokenType::TEXT), + EqualsLexerToken("", Lexer::TokenType::STAR))); lexer = std::make_unique<Lexer>("*ORfoo", 
Lexer::Language::QUERY); ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens()); EXPECT_THAT(tokens, - ElementsAre(EqualsLexerToken("*ORfoo", Lexer::TokenType::TEXT))); + ElementsAre(EqualsLexerToken("", Lexer::TokenType::STAR), + EqualsLexerToken("ORfoo", Lexer::TokenType::TEXT))); lexer = std::make_unique<Lexer>("fooANDbar*", Lexer::Language::QUERY); ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens()); - EXPECT_THAT(tokens, ElementsAre(EqualsLexerToken("fooANDbar*", - Lexer::TokenType::TEXT))); + EXPECT_THAT(tokens, + ElementsAre(EqualsLexerToken("fooANDbar", Lexer::TokenType::TEXT), + EqualsLexerToken("", Lexer::TokenType::STAR))); } TEST(LexerTest, SimpleStringQuery) { @@ -296,7 +300,8 @@ TEST(LexerTest, ComplexQuery) { EqualsLexerToken("sender", Lexer::TokenType::TEXT), EqualsLexerToken(":", Lexer::TokenType::COMPARATOR), EqualsLexerToken(Lexer::TokenType::LPAREN), - EqualsLexerToken("foo*", Lexer::TokenType::TEXT), + EqualsLexerToken("foo", Lexer::TokenType::TEXT), + EqualsLexerToken("", Lexer::TokenType::STAR), EqualsLexerToken(Lexer::TokenType::AND), EqualsLexerToken("bar", Lexer::TokenType::TEXT), EqualsLexerToken(Lexer::TokenType::OR), @@ -376,14 +381,13 @@ TEST(LexerTest, CJKT) { lexer = std::make_unique<Lexer>("ញុំ&&ដើរទៅ||ធ្វើការ-រាល់ថ្ងៃ", Lexer::Language::QUERY); ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens()); - EXPECT_THAT(tokens, - ElementsAre(EqualsLexerToken("ញុំ", Lexer::TokenType::TEXT), - EqualsLexerToken(Lexer::TokenType::AND), - EqualsLexerToken("ដើរទៅ", Lexer::TokenType::TEXT), - EqualsLexerToken(Lexer::TokenType::OR), - EqualsLexerToken("ធ្វើការ", Lexer::TokenType::TEXT), - EqualsLexerToken(Lexer::TokenType::MINUS), - EqualsLexerToken("រាល់ថ្ងៃ", Lexer::TokenType::TEXT))); + EXPECT_THAT( + tokens, + ElementsAre(EqualsLexerToken("ញុំ", Lexer::TokenType::TEXT), + EqualsLexerToken(Lexer::TokenType::AND), + EqualsLexerToken("ដើរទៅ", Lexer::TokenType::TEXT), + EqualsLexerToken(Lexer::TokenType::OR), + 
EqualsLexerToken("ធ្វើការ-រាល់ថ្ងៃ", Lexer::TokenType::TEXT))); lexer = std::make_unique<Lexer>( "나는" @@ -477,7 +481,9 @@ TEST(LexerTest, ScoringArithmetic) { lexer = std::make_unique<Lexer>("1+2*3/4", Lexer::Language::QUERY); ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens()); EXPECT_THAT(tokens, - ElementsAre(EqualsLexerToken("1+2*3/4", Lexer::TokenType::TEXT))); + ElementsAre(EqualsLexerToken("1+2", Lexer::TokenType::TEXT), + EqualsLexerToken("", Lexer::TokenType::STAR), + EqualsLexerToken("3/4", Lexer::TokenType::TEXT))); } // Currently, in scoring language, the lexer will view these logic operators as @@ -609,6 +615,52 @@ TEST(LexerTest, ComplexScoring) { EqualsLexerToken(Lexer::TokenType::RPAREN))); } +// foo:bar:baz is considered an invalid query as proposed in +// http://go/appsearch-advanced-query-impl-plan#bookmark=id.yoeyepokmbc5 ; this +// ensures that the lexer consistently tokenizes colons independently. +TEST(LexerTest, NoAmbiguousTokenizing) { + // This is an invalid query; the lexer doesn't treat `bar:baz` as one token. 
+ std::unique_ptr<Lexer> lexer = + std::make_unique<Lexer>("foo:bar:baz", Lexer::Language::QUERY); + ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> invalidQueryTokens, + lexer->ExtractTokens()); + EXPECT_THAT(invalidQueryTokens, + ElementsAre(EqualsLexerToken("foo", Lexer::TokenType::TEXT), + EqualsLexerToken(":", Lexer::TokenType::COMPARATOR), + EqualsLexerToken("bar", Lexer::TokenType::TEXT), + EqualsLexerToken(":", Lexer::TokenType::COMPARATOR), + EqualsLexerToken("baz", Lexer::TokenType::TEXT))); + + lexer = std::make_unique<Lexer>("foo:\"bar:baz\"", Lexer::Language::QUERY); + ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> validQueryTokens, + lexer->ExtractTokens()); + EXPECT_THAT( + validQueryTokens, + ElementsAre(EqualsLexerToken("foo", Lexer::TokenType::TEXT), + EqualsLexerToken(":", Lexer::TokenType::COMPARATOR), + EqualsLexerToken("bar:baz", Lexer::TokenType::STRING))); +} + +TEST(LexerTest, WhiteSpacesDoNotAffectColonTokenization) { + std::unique_ptr<Lexer> lexer = + std::make_unique<Lexer>("a:b c : d e: f g :h", Lexer::Language::QUERY); + ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> tokens, + lexer->ExtractTokens()); + EXPECT_THAT(tokens, + ElementsAre(EqualsLexerToken("a", Lexer::TokenType::TEXT), + EqualsLexerToken(":", Lexer::TokenType::COMPARATOR), + EqualsLexerToken("b", Lexer::TokenType::TEXT), + EqualsLexerToken("c", Lexer::TokenType::TEXT), + EqualsLexerToken(":", Lexer::TokenType::COMPARATOR), + EqualsLexerToken("d", Lexer::TokenType::TEXT), + EqualsLexerToken("e", Lexer::TokenType::TEXT), + EqualsLexerToken(":", Lexer::TokenType::COMPARATOR), + EqualsLexerToken("f", Lexer::TokenType::TEXT), + EqualsLexerToken("g", Lexer::TokenType::TEXT), + EqualsLexerToken(":", Lexer::TokenType::COMPARATOR), + EqualsLexerToken("h", Lexer::TokenType::TEXT))); +} + TEST(LexerTest, QueryShouldRejectTokensBeyondLimit) { std::string query; for (int i = 0; i < Lexer::kMaxNumTokens + 1; ++i) { diff --git 
a/icing/query/advanced_query_parser/parser.cc b/icing/query/advanced_query_parser/parser.cc index 086f038..0e4c78d 100644 --- a/icing/query/advanced_query_parser/parser.cc +++ b/icing/query/advanced_query_parser/parser.cc @@ -72,15 +72,24 @@ Parser::ConsumeFunctionName() { return function_name_node; } +// stringElement +// : STRING STAR? libtextclassifier3::StatusOr<std::unique_ptr<StringNode>> -Parser::ConsumeString() { +Parser::ConsumeStringElement() { if (!Match(Lexer::TokenType::STRING)) { return absl_ports::InvalidArgumentError( "Unable to consume token as STRING."); } - auto node = std::make_unique<StringNode>(std::move(current_token_->text)); + std::string text = std::move(current_token_->text); ++current_token_; - return node; + + bool is_prefix = false; + if (Match(Lexer::TokenType::STAR)) { + is_prefix = true; + ++current_token_; + } + + return std::make_unique<StringNode>(std::move(text), is_prefix); } libtextclassifier3::StatusOr<std::string> Parser::ConsumeComparator() { @@ -95,25 +104,35 @@ libtextclassifier3::StatusOr<std::string> Parser::ConsumeComparator() { // member // : TEXT (DOT TEXT)* (DOT function)? +// | TEXT STAR // ; libtextclassifier3::StatusOr<std::unique_ptr<MemberNode>> Parser::ConsumeMember() { ICING_ASSIGN_OR_RETURN(std::unique_ptr<TextNode> text_node, ConsumeText()); std::vector<std::unique_ptr<TextNode>> children; - children.push_back(std::move(text_node)); - - while (Match(Lexer::TokenType::DOT)) { - Consume(Lexer::TokenType::DOT); - if (MatchFunction()) { - ICING_ASSIGN_OR_RETURN(std::unique_ptr<FunctionNode> function_node, - ConsumeFunction()); - // Once a function is matched, we should exit the current rule based on - // the grammar. - return std::make_unique<MemberNode>(std::move(children), - std::move(function_node)); - } - ICING_ASSIGN_OR_RETURN(text_node, ConsumeText()); + + // Member could be either `TEXT (DOT TEXT)* (DOT function)?` or `TEXT STAR` + // at this point. So check for 'STAR' to differentiate the two cases. 
+ if (Match(Lexer::TokenType::STAR)) { + Consume(Lexer::TokenType::STAR); + text_node = std::make_unique<TextNode>(std::move(*text_node).value(), + /*is_prefix=*/true); + children.push_back(std::move(text_node)); + } else { children.push_back(std::move(text_node)); + while (Match(Lexer::TokenType::DOT)) { + Consume(Lexer::TokenType::DOT); + if (MatchFunction()) { + ICING_ASSIGN_OR_RETURN(std::unique_ptr<FunctionNode> function_node, + ConsumeFunction()); + // Once a function is matched, we should exit the current rule based on + // the grammar. + return std::make_unique<MemberNode>(std::move(children), + std::move(function_node)); + } + ICING_ASSIGN_OR_RETURN(text_node, ConsumeText()); + children.push_back(std::move(text_node)); + } } return std::make_unique<MemberNode>(std::move(children), /*function=*/nullptr); @@ -141,14 +160,14 @@ Parser::ConsumeFunction() { } // comparable -// : STRING +// : stringElement // | member // | function // ; libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeComparable() { if (Match(Lexer::TokenType::STRING)) { - return ConsumeString(); + return ConsumeStringElement(); } else if (MatchMember()) { return ConsumeMember(); } @@ -186,7 +205,7 @@ Parser::ConsumeArgs() { } // restriction -// : comparable (COMPARATOR (comparable | composite))? +// : comparable (COMPARATOR MINUS? (comparable | composite))? // ; // COMPARATOR will not be produced in Scoring Lexer. 
libtextclassifier3::StatusOr<std::unique_ptr<Node>> @@ -197,6 +216,12 @@ Parser::ConsumeRestriction() { return comparable; } ICING_ASSIGN_OR_RETURN(std::string operator_text, ConsumeComparator()); + + bool has_minus = Match(Lexer::TokenType::MINUS); + if (has_minus) { + Consume(Lexer::TokenType::MINUS); + } + std::unique_ptr<Node> arg; if (MatchComposite()) { ICING_ASSIGN_OR_RETURN(arg, ConsumeComposite()); @@ -206,6 +231,11 @@ Parser::ConsumeRestriction() { return absl_ports::InvalidArgumentError( "ARG: must begin with LPAREN or FIRST(comparable)"); } + + if (has_minus) { + arg = std::make_unique<UnaryOperatorNode>("MINUS", std::move(arg)); + } + std::vector<std::unique_ptr<Node>> args; args.push_back(std::move(comparable)); args.push_back(std::move(arg)); @@ -243,10 +273,11 @@ libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeTerm() { } else { if (Match(Lexer::TokenType::NOT)) { Consume(Lexer::TokenType::NOT); + operator_text = "NOT"; } else { Consume(Lexer::TokenType::MINUS); + operator_text = "MINUS"; } - operator_text = "NOT"; } ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> simple, ConsumeSimple()); return std::make_unique<UnaryOperatorNode>(operator_text, std::move(simple)); diff --git a/icing/query/advanced_query_parser/parser.h b/icing/query/advanced_query_parser/parser.h index 330b8b9..a48c562 100644 --- a/icing/query/advanced_query_parser/parser.h +++ b/icing/query/advanced_query_parser/parser.h @@ -94,7 +94,8 @@ class Parser { libtextclassifier3::StatusOr<std::unique_ptr<FunctionNameNode>> ConsumeFunctionName(); - libtextclassifier3::StatusOr<std::unique_ptr<StringNode>> ConsumeString(); + libtextclassifier3::StatusOr<std::unique_ptr<StringNode>> + ConsumeStringElement(); libtextclassifier3::StatusOr<std::string> ConsumeComparator(); diff --git a/icing/query/advanced_query_parser/parser_integration_test.cc b/icing/query/advanced_query_parser/parser_integration_test.cc index 88cc079..fa1bd2e 100644 --- 
a/icing/query/advanced_query_parser/parser_integration_test.cc +++ b/icing/query/advanced_query_parser/parser_integration_test.cc @@ -189,7 +189,7 @@ TEST(ParserIntegrationTest, Minus) { parser.ConsumeQuery()); // Expected AST: - // NOT + // MINUS // | // member // | @@ -197,11 +197,11 @@ TEST(ParserIntegrationTest, Minus) { SimpleVisitor visitor; tree_root->Accept(&visitor); // SimpleVisitor ordering - // { text, member, NOT } + // { text, member, MINUS } EXPECT_THAT(visitor.nodes(), ElementsAre(EqualsNodeInfo("foo", NodeType::kText), EqualsNodeInfo("", NodeType::kMember), - EqualsNodeInfo("NOT", NodeType::kUnaryOperator))); + EqualsNodeInfo("MINUS", NodeType::kUnaryOperator))); } TEST(ParserIntegrationTest, Has) { diff --git a/icing/query/advanced_query_parser/parser_test.cc b/icing/query/advanced_query_parser/parser_test.cc index f997329..502dbd3 100644 --- a/icing/query/advanced_query_parser/parser_test.cc +++ b/icing/query/advanced_query_parser/parser_test.cc @@ -181,7 +181,7 @@ TEST(ParserTest, Minus) { parser.ConsumeQuery()); // Expected AST: - // NOT + // MINUS // | // member // | @@ -189,11 +189,11 @@ TEST(ParserTest, Minus) { SimpleVisitor visitor; tree_root->Accept(&visitor); // SimpleVisitor ordering - // { text, member, NOT } + // { text, member, MINUS } EXPECT_THAT(visitor.nodes(), ElementsAre(EqualsNodeInfo("foo", NodeType::kText), EqualsNodeInfo("", NodeType::kMember), - EqualsNodeInfo("NOT", NodeType::kUnaryOperator))); + EqualsNodeInfo("MINUS", NodeType::kUnaryOperator))); } TEST(ParserTest, Has) { diff --git a/icing/query/advanced_query_parser/pending-value.cc b/icing/query/advanced_query_parser/pending-value.cc index 306812d..903e12f 100644 --- a/icing/query/advanced_query_parser/pending-value.cc +++ b/icing/query/advanced_query_parser/pending-value.cc @@ -24,14 +24,19 @@ libtextclassifier3::Status PendingValue::ParseInt() { } else if (data_type_ != DataType::kText) { return absl_ports::InvalidArgumentError("Cannot parse value as LONG"); } + if 
(query_term_.is_prefix_val) { + return absl_ports::InvalidArgumentError(absl_ports::StrCat( + "Cannot use prefix operator '*' with numeric value: ", + query_term_.term)); + } char* value_end; - long_val_ = std::strtoll(string_vals_.at(0).c_str(), &value_end, /*base=*/10); - if (value_end != string_vals_.at(0).c_str() + string_vals_.at(0).length()) { + long_val_ = std::strtoll(query_term_.term.c_str(), &value_end, /*base=*/10); + if (value_end != query_term_.term.c_str() + query_term_.term.length()) { return absl_ports::InvalidArgumentError(absl_ports::StrCat( - "Unable to parse \"", string_vals_.at(0), "\" as number.")); + "Unable to parse \"", query_term_.term, "\" as number.")); } data_type_ = DataType::kLong; - string_vals_.clear(); + query_term_ = {"", false}; return libtextclassifier3::Status::OK; } diff --git a/icing/query/advanced_query_parser/pending-value.h b/icing/query/advanced_query_parser/pending-value.h index 8a8704d..d18789d 100644 --- a/icing/query/advanced_query_parser/pending-value.h +++ b/icing/query/advanced_query_parser/pending-value.h @@ -36,14 +36,19 @@ enum class DataType { kDocumentIterator, }; +struct QueryTerm { + std::string term; + bool is_prefix_val; +}; + // A holder for intermediate results when processing child nodes. 
struct PendingValue { - static PendingValue CreateStringPendingValue(std::string str) { - return PendingValue(std::move(str), DataType::kString); + static PendingValue CreateStringPendingValue(QueryTerm query_term) { + return PendingValue(std::move(query_term), DataType::kString); } - static PendingValue CreateTextPendingValue(std::string text) { - return PendingValue(std::move(text), DataType::kText); + static PendingValue CreateTextPendingValue(QueryTerm query_term) { + return PendingValue(std::move(query_term), DataType::kText); } PendingValue() : data_type_(DataType::kNone) {} @@ -82,22 +87,22 @@ struct PendingValue { return std::move(string_vals_); } - libtextclassifier3::StatusOr<const std::string*> string_val() const& { + libtextclassifier3::StatusOr<const QueryTerm*> string_val() const& { ICING_RETURN_IF_ERROR(CheckDataType(DataType::kString)); - return &string_vals_.at(0); + return &query_term_; } - libtextclassifier3::StatusOr<std::string> string_val() && { + libtextclassifier3::StatusOr<QueryTerm> string_val() && { ICING_RETURN_IF_ERROR(CheckDataType(DataType::kString)); - return std::move(string_vals_.at(0)); + return std::move(query_term_); } - libtextclassifier3::StatusOr<const std::string*> text_val() const& { + libtextclassifier3::StatusOr<const QueryTerm*> text_val() const& { ICING_RETURN_IF_ERROR(CheckDataType(DataType::kText)); - return &string_vals_.at(0); + return &query_term_; } - libtextclassifier3::StatusOr<std::string> text_val() && { + libtextclassifier3::StatusOr<QueryTerm> text_val() && { ICING_RETURN_IF_ERROR(CheckDataType(DataType::kText)); - return std::move(string_vals_.at(0)); + return std::move(query_term_); } libtextclassifier3::StatusOr<int64_t> long_val() { @@ -119,8 +124,8 @@ struct PendingValue { DataType data_type() const { return data_type_; } private: - explicit PendingValue(std::string str, DataType data_type) - : string_vals_({std::move(str)}), data_type_(data_type) {} + explicit PendingValue(QueryTerm query_term, 
DataType data_type) + : query_term_({std::move(query_term)}), data_type_(data_type) {} libtextclassifier3::Status CheckDataType(DataType required_data_type) const { if (data_type_ == required_data_type) { @@ -136,10 +141,12 @@ struct PendingValue { // iterator_ will be populated when data_type_ is kDocumentIterator. std::unique_ptr<DocHitInfoIterator> iterator_; - // string_vals_ will be populated when data_type_ is kString, kText and - // kStringList. + // string_vals_ will be populated when data_type_ is kStringList. std::vector<std::string> string_vals_; + // query_term_ will be populated when data_type_ is kString or kText + QueryTerm query_term_; + // long_val_ will be populated when data_type_ is kLong - after a successful // call to ParseInt. int64_t long_val_; diff --git a/icing/query/advanced_query_parser/query-visitor.cc b/icing/query/advanced_query_parser/query-visitor.cc index 71e1c7c..9df1264 100644 --- a/icing/query/advanced_query_parser/query-visitor.cc +++ b/icing/query/advanced_query_parser/query-visitor.cc @@ -54,8 +54,8 @@ struct CreateList { std::vector<std::string> values; values.reserve(args.size()); for (PendingValue& arg : args) { - std::string val = std::move(arg).string_val().ValueOrDie(); - values.push_back(std::move(val)); + QueryTerm val = std::move(arg).string_val().ValueOrDie(); + values.push_back(std::move(val.term)); } return PendingValue(std::move(values)); } @@ -168,16 +168,17 @@ void QueryVisitor::PendingPropertyRestricts::AddValidRestricts( } libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> -QueryVisitor::CreateTermIterator(const std::string& term) { +QueryVisitor::CreateTermIterator(QueryTerm query_term) { + TermMatchType::Code match_type = GetTermMatchType(query_term.is_prefix_val); if (!processing_not_) { // 1. 
Add term to property_query_terms_map if (pending_property_restricts_.has_active_property_restricts()) { for (const std::string& property_restrict : pending_property_restricts_.active_property_restricts()) { - property_query_terms_map_[property_restrict].insert(term); + property_query_terms_map_[property_restrict].insert(query_term.term); } } else { - property_query_terms_map_[""].insert(term); + property_query_terms_map_[""].insert(query_term.term); } // 2. If needed add term iterator to query_term_iterators_ map. @@ -186,22 +187,22 @@ QueryVisitor::CreateTermIterator(const std::string& term) { // pass it into index.GetIterator ICING_ASSIGN_OR_RETURN( std::unique_ptr<DocHitInfoIterator> term_iterator, - index_.GetIterator(term, /*term_start_index=*/0, + index_.GetIterator(query_term.term, /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, - match_type_, needs_term_frequency_info_)); - query_term_iterators_[term] = std::make_unique<DocHitInfoIteratorFilter>( - std::move(term_iterator), &document_store_, &schema_store_, - filter_options_); + match_type, needs_term_frequency_info_)); + query_term_iterators_[query_term.term] = + std::make_unique<DocHitInfoIteratorFilter>( + std::move(term_iterator), &document_store_, &schema_store_, + filter_options_); } } // 3. Add the term iterator. - // TODO(b/208654892): Add support for the prefix operator (*). // TODO(b/152934343) Save "term start index" into Node and PendingValue and // pass it into index.GetIterator - return index_.GetIterator(term, /*term_start_index=*/0, + return index_.GetIterator(query_term.term, /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, - match_type_, needs_term_frequency_info_); + match_type, needs_term_frequency_info_); } void QueryVisitor::RegisterFunctions() { @@ -248,8 +249,8 @@ libtextclassifier3::StatusOr<PendingValue> QueryVisitor::SearchFunction( // The first arg is guaranteed to be a STRING at this point. 
It should be safe // to call ValueOrDie. - const std::string* query = args.at(0).string_val().ValueOrDie(); - Lexer lexer(*query, Lexer::Language::QUERY); + const QueryTerm* query = args.at(0).string_val().ValueOrDie(); + Lexer lexer(query->term, Lexer::Language::QUERY); ICING_ASSIGN_OR_RETURN(std::vector<Lexer::LexerToken> lexer_tokens, lexer.ExtractTokens()); @@ -307,22 +308,21 @@ libtextclassifier3::StatusOr<int64_t> QueryVisitor::PopPendingIntValue() { return int_value; } -libtextclassifier3::StatusOr<std::string> -QueryVisitor::PopPendingStringValue() { +libtextclassifier3::StatusOr<QueryTerm> QueryVisitor::PopPendingStringValue() { if (pending_values_.empty()) { return absl_ports::InvalidArgumentError("Unable to retrieve string value."); } - ICING_ASSIGN_OR_RETURN(std::string string_value, + ICING_ASSIGN_OR_RETURN(QueryTerm string_value, std::move(pending_values_.top()).string_val()); pending_values_.pop(); return string_value; } -libtextclassifier3::StatusOr<std::string> QueryVisitor::PopPendingTextValue() { +libtextclassifier3::StatusOr<QueryTerm> QueryVisitor::PopPendingTextValue() { if (pending_values_.empty()) { return absl_ports::InvalidArgumentError("Unable to retrieve text value."); } - ICING_ASSIGN_OR_RETURN(std::string text_value, + ICING_ASSIGN_OR_RETURN(QueryTerm text_value, std::move(pending_values_.top()).text_val()); pending_values_.pop(); return text_value; @@ -340,19 +340,33 @@ QueryVisitor::PopPendingIterator() { return iterator; } else if (pending_values_.top().data_type() == DataType::kString) { features_.insert(kVerbatimSearchFeature); - ICING_ASSIGN_OR_RETURN(std::string value, PopPendingStringValue()); - return CreateTermIterator(std::move(value)); + ICING_ASSIGN_OR_RETURN(QueryTerm string_value, PopPendingStringValue()); + return CreateTermIterator(std::move(string_value)); } else { - ICING_ASSIGN_OR_RETURN(std::string value, PopPendingTextValue()); + ICING_ASSIGN_OR_RETURN(QueryTerm text_value, PopPendingTextValue()); 
ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> token_itr, - tokenizer_.Tokenize(value)); + tokenizer_.Tokenize(text_value.term)); std::string normalized_term; std::vector<std::unique_ptr<DocHitInfoIterator>> iterators; - while (token_itr->Advance()) { - for (const Token& token : token_itr->GetTokens()) { + // The tokenizer will produce 1+ tokens out of the text. The prefix operator + // only applies to the final token. + bool reached_final_token = !token_itr->Advance(); + while (!reached_final_token) { + std::vector<Token> tokens = token_itr->GetTokens(); + reached_final_token = !token_itr->Advance(); + + // The tokenizer iterator iterates between token groups. In practice, the + // tokenizer used with QueryVisitor (PlainTokenizer) will always only + // produce a single token per token group. + // For simplicity, we will apply the prefix operator to *all* tokens + // in the final token group. + for (const Token& token : tokens) { normalized_term = normalizer_.NormalizeTerm(token.text); - ICING_ASSIGN_OR_RETURN(std::unique_ptr<DocHitInfoIterator> iterator, - CreateTermIterator(std::move(normalized_term))); + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<DocHitInfoIterator> iterator, + CreateTermIterator( + QueryTerm{std::move(normalized_term), + reached_final_token && text_value.is_prefix_val})); iterators.push_back(std::move(iterator)); } } @@ -382,27 +396,65 @@ QueryVisitor::PopAllPendingIterators() { return iterators; } -libtextclassifier3::StatusOr<PendingValue> -QueryVisitor::ProcessNumericComparator(const NaryOperatorNode* node) { - // 1. The children should have been processed and added their outputs to - // pending_values_. Time to process them. - // The first two pending values should be the int value and the property. +libtextclassifier3::Status QueryVisitor::ProcessNumericComparator( + const NaryOperatorNode* node) { + if (node->children().size() != 2) { + return absl_ports::InvalidArgumentError("Expected 2 children."); + } + + // 1. 
Put in a placeholder PendingValue + pending_values_.push(PendingValue()); + + // 2. The first child is the property to restrict by. + node->children().at(0)->Accept(this); + if (has_pending_error()) { + return std::move(pending_error_); + } + ICING_ASSIGN_OR_RETURN(QueryTerm text_value, PopPendingTextValue()); + + if (text_value.is_prefix_val) { + return absl_ports::InvalidArgumentError( + "Cannot use prefix operator '*' with a property name!"); + } + + // If there is an active property restrict and this property is not present in + // in the active restrict set, then it's not satisfiable. + if (pending_property_restricts_.has_active_property_restricts() && + pending_property_restricts_.active_property_restricts().find( + text_value.term) == + pending_property_restricts_.active_property_restricts().end()) { + // The property restrict can't be satisfiable. Pop the placeholder that was + // just added and push a FALSE iterator. + pending_property_restricts_.PopRestricts(); + pending_values_.pop(); + pending_values_.push( + PendingValue(std::make_unique<DocHitInfoIteratorNone>())); + return libtextclassifier3::Status::OK; + } + + // 3. The second child should be parseable as an integer value. + expecting_numeric_arg_ = true; + node->children().at(1)->Accept(this); + expecting_numeric_arg_ = false; ICING_ASSIGN_OR_RETURN(int64_t int_value, PopPendingIntValue()); - ICING_ASSIGN_OR_RETURN(std::string property, PopPendingTextValue()); - // 2. Create the iterator. + // 4. Check for the placeholder. + if (!pending_values_.top().is_placeholder()) { + return absl_ports::InvalidArgumentError( + "Error processing arguments for node."); + } + pending_values_.pop(); + + // 5. Create the iterator and push it onto pending_values_. 
ICING_ASSIGN_OR_RETURN(Int64Range range, GetInt64Range(node->operator_text(), int_value)); - auto iterator_or = - numeric_index_.GetIterator(property, range.low, range.high); - if (!iterator_or.ok()) { - return std::move(iterator_or).status(); - } + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<DocHitInfoIterator> iterator, + numeric_index_.GetIterator(text_value.term, range.low, range.high)); features_.insert(kNumericSearchFeature); - std::unique_ptr<DocHitInfoIterator> iterator = - std::move(iterator_or).ValueOrDie(); - return PendingValue(std::move(iterator)); + pending_values_.push(PendingValue(std::move(iterator))); + return libtextclassifier3::Status::OK; } libtextclassifier3::StatusOr<PendingValue> QueryVisitor::ProcessAndOperator( @@ -421,6 +473,85 @@ libtextclassifier3::StatusOr<PendingValue> QueryVisitor::ProcessOrOperator( return PendingValue(CreateOrIterator(std::move(iterators))); } +libtextclassifier3::Status QueryVisitor::ProcessNegationOperator( + const UnaryOperatorNode* node) { + // 1. Put in a placeholder PendingValue + pending_values_.push(PendingValue()); + + // 2. Visit child + node->child()->Accept(this); + if (has_pending_error()) { + return std::move(pending_error_); + } + + if (pending_values_.size() < 2) { + return absl_ports::InvalidArgumentError( + "Visit unary operator child didn't correctly add pending values."); + } + + // 3. We want to preserve the original text of the integer value, append our + // minus and *then* parse as an int. + ICING_ASSIGN_OR_RETURN(QueryTerm int_text_val, PopPendingTextValue()); + int_text_val.term = absl_ports::StrCat("-", int_text_val.term); + PendingValue pending_value = + PendingValue::CreateTextPendingValue(std::move(int_text_val)); + ICING_RETURN_IF_ERROR(pending_value.long_val()); + + // We've parsed our integer value successfully. Pop our placeholder, push it + // on to the stack and return successfully. 
+ if (!pending_values_.top().is_placeholder()) { + return absl_ports::InvalidArgumentError( + "Error processing arguments for node."); + } + pending_values_.pop(); + pending_values_.push(std::move(pending_value)); + return libtextclassifier3::Status::OK; +} + +libtextclassifier3::Status QueryVisitor::ProcessNotOperator( + const UnaryOperatorNode* node) { + // TODO(b/265312785) Consider implementing query optimization when we run into + // nested NOTs. This would allow us to simplify a query like "NOT (-foo)" to + // just "foo". This would also require more complicate rewrites as we would + // need to do things like rewrite "NOT (-a OR b)" as "a AND -b" and + // "NOT (price < 5)" as "price >= 5". + // 1. Put in a placeholder PendingValue + pending_values_.push(PendingValue()); + // Toggle whatever the current value of 'processing_not_' is before visiting + // the children. + processing_not_ = !processing_not_; + + // 2. Visit child + node->child()->Accept(this); + if (has_pending_error()) { + return std::move(pending_error_); + } + + if (pending_values_.size() < 2) { + return absl_ports::InvalidArgumentError( + "Visit unary operator child didn't correctly add pending values."); + } + + // 3. Retrieve the delegate iterator + ICING_ASSIGN_OR_RETURN(std::unique_ptr<DocHitInfoIterator> delegate, + PopPendingIterator()); + + // 4. Check for the placeholder. + if (!pending_values_.top().is_placeholder()) { + return absl_ports::InvalidArgumentError( + "Error processing arguments for node."); + } + pending_values_.pop(); + + pending_values_.push(PendingValue(std::make_unique<DocHitInfoIteratorNot>( + std::move(delegate), document_store_.last_added_document_id()))); + + // Untoggle whatever the current value of 'processing_not_' is now that we've + // finished processing this NOT. 
+ processing_not_ = !processing_not_; + return libtextclassifier3::Status::OK; +} + libtextclassifier3::Status QueryVisitor::ProcessHasOperator( const NaryOperatorNode* node) { if (node->children().size() != 2) { @@ -435,8 +566,12 @@ libtextclassifier3::Status QueryVisitor::ProcessHasOperator( if (has_pending_error()) { return pending_error_; } - ICING_ASSIGN_OR_RETURN(std::string property, PopPendingTextValue()); - pending_property_restricts_.AddValidRestricts({property}); + ICING_ASSIGN_OR_RETURN(QueryTerm text_value, PopPendingTextValue()); + if (text_value.is_prefix_val) { + return absl_ports::InvalidArgumentError( + "Cannot use prefix operator '*' with a property name!"); + } + pending_property_restricts_.AddValidRestricts({text_value.term}); // Just added a restrict - if there are no active property restricts then that // be because this restrict is unsatisfiable. @@ -466,7 +601,7 @@ libtextclassifier3::Status QueryVisitor::ProcessHasOperator( pending_values_.pop(); pending_property_restricts_.PopRestricts(); - std::set<std::string> property_restricts = {std::move(property)}; + std::set<std::string> property_restricts = {std::move(text_value.term)}; pending_values_.push( PendingValue(std::make_unique<DocHitInfoIteratorSectionRestrict>( std::move(delegate), &document_store_, &schema_store_, @@ -487,16 +622,16 @@ void QueryVisitor::VisitString(const StringNode* node) { return; } std::string unescaped_string = std::move(unescaped_string_or).ValueOrDie(); - pending_values_.push( - PendingValue::CreateStringPendingValue(std::move(unescaped_string))); + pending_values_.push(PendingValue::CreateStringPendingValue( + QueryTerm{std::move(unescaped_string), node->is_prefix()})); } void QueryVisitor::VisitText(const TextNode* node) { // TEXT nodes could either be a term (and will become DocHitInfoIteratorTerm) // or a property name. As such, we just push the TEXT value into pending // values and determine which it is at a later point. 
- pending_values_.push( - PendingValue::CreateTextPendingValue(std::move(node->value()))); + pending_values_.push(PendingValue::CreateTextPendingValue( + QueryTerm{std::move(node->value()), node->is_prefix()})); } void QueryVisitor::VisitMember(const MemberNode* node) { @@ -517,21 +652,41 @@ void QueryVisitor::VisitMember(const MemberNode* node) { } } - // 3. The children should have been processed and added their outputs to - // pending_values_. Time to process them. - libtextclassifier3::StatusOr<std::string> member_or; - std::vector<std::string> members; - while (!pending_values_.empty() && !pending_values_.top().is_placeholder()) { - member_or = PopPendingTextValue(); - if (!member_or.ok()) { - pending_error_ = std::move(member_or).status(); - return; + // 3. Now process the results of the children and produce a single pending + // value representing this member. + PendingValue pending_value; + if (node->children().size() == 1) { + // 3a. This member only has a single child, then the pending value produced + // by that child is the final value produced by this member. + pending_value = std::move(pending_values_.top()); + pending_values_.pop(); + } else { + // 3b. Retrieve the values of all children and concatenate them into a + // single value. 
+ libtextclassifier3::StatusOr<QueryTerm> member_or; + std::vector<std::string> members; + QueryTerm text_val; + while (!pending_values_.empty() && + !pending_values_.top().is_placeholder()) { + member_or = PopPendingTextValue(); + if (!member_or.ok()) { + pending_error_ = std::move(member_or).status(); + return; + } + text_val = std::move(member_or).ValueOrDie(); + if (text_val.is_prefix_val) { + pending_error_ = absl_ports::InvalidArgumentError( + "Cannot use prefix operator '*' within a property name!"); + return; + } + members.push_back(std::move(text_val.term)); } - members.push_back(std::move(member_or).ValueOrDie()); + QueryTerm member; + member.term = absl_ports::StrJoin(members.rbegin(), members.rend(), + property_util::kPropertyPathSeparator); + member.is_prefix_val = false; + pending_value = PendingValue::CreateTextPendingValue(std::move(member)); } - std::string member = - absl_ports::StrJoin(members.rbegin(), members.rend(), - property_util::kPropertyPathSeparator); // 4. If pending_values_ is empty somehow, then our placeholder disappeared // somehow. 
@@ -542,7 +697,7 @@ void QueryVisitor::VisitMember(const MemberNode* node) { } pending_values_.pop(); - pending_values_.push(PendingValue::CreateTextPendingValue(std::move(member))); + pending_values_.push(std::move(pending_value)); } void QueryVisitor::VisitFunction(const FunctionNode* node) { @@ -594,59 +749,26 @@ void QueryVisitor::VisitFunction(const FunctionNode* node) { // `NOT search("foo", createList("prop1")) // AND search("bar", createList("prop1"))` void QueryVisitor::VisitUnaryOperator(const UnaryOperatorNode* node) { - if (node->operator_text() != "NOT") { + bool is_minus = node->operator_text() == "MINUS"; + if (node->operator_text() != "NOT" && !is_minus) { pending_error_ = absl_ports::UnimplementedError( absl_ports::StrCat("Visiting for unary operator ", node->operator_text(), " not implemented yet.")); return; } - // TODO(b/265312785) Consider implementing query optimization when we run into - // nested NOTs. This would allow us to simplify a query like "NOT (-foo)" to - // just "foo". This would also require more complicate rewrites as we would - // need to do things like rewrite "NOT (-a OR b)" as "a AND -b" and - // "NOT (price < 5)" as "price >= 5". - // 1. Put in a placeholder PendingValue - pending_values_.push(PendingValue()); - // Toggle whatever the current value of 'processing_not_' is before visiting - // the children. - processing_not_ = !processing_not_; - - // 2. Visit child - node->child()->Accept(this); - if (has_pending_error()) { - return; - } - - if (pending_values_.size() < 2) { - pending_error_ = absl_ports::InvalidArgumentError( - "Visit unary operator child didn't correctly add pending values."); - return; - } - - // 3. 
Retrieve the delegate iterator - auto iterator_or = PopPendingIterator(); - if (!iterator_or.ok()) { - pending_error_ = std::move(iterator_or).status(); - return; + libtextclassifier3::Status status; + if (expecting_numeric_arg_ && is_minus) { + // If the operator is a MINUS ('-') and we're at the child of a numeric + // comparator, then this must be a negation ('-3') + status = ProcessNegationOperator(node); + } else { + status = ProcessNotOperator(node); } - std::unique_ptr<DocHitInfoIterator> delegate = - std::move(iterator_or).ValueOrDie(); - // 4. Check for the placeholder. - if (!pending_values_.top().is_placeholder()) { - pending_error_ = absl_ports::InvalidArgumentError( - "Error processing arguments for node."); - return; + if (!status.ok()) { + pending_error_ = std::move(status); } - pending_values_.pop(); - - pending_values_.push(PendingValue(std::make_unique<DocHitInfoIteratorNot>( - std::move(delegate), document_store_.last_added_document_id()))); - - // Untoggle whatever the current value of 'processing_not_' is now that we've - // finished processing this NOT. - processing_not_ = !processing_not_; } void QueryVisitor::VisitNaryOperator(const NaryOperatorNode* node) { @@ -662,6 +784,12 @@ void QueryVisitor::VisitNaryOperator(const NaryOperatorNode* node) { pending_error_ = std::move(status); } return; + } else if (IsNumericComparator(node->operator_text())) { + libtextclassifier3::Status status = ProcessNumericComparator(node); + if (!status.ok()) { + pending_error_ = std::move(status); + } + return; } // 1. Put in a placeholder PendingValue @@ -677,9 +805,7 @@ void QueryVisitor::VisitNaryOperator(const NaryOperatorNode* node) { // 3. Retrieve the pending value for this node. 
libtextclassifier3::StatusOr<PendingValue> pending_value_or; - if (IsNumericComparator(node->operator_text())) { - pending_value_or = ProcessNumericComparator(node); - } else if (node->operator_text() == "AND") { + if (node->operator_text() == "AND") { pending_value_or = ProcessAndOperator(node); } else if (node->operator_text() == "OR") { pending_value_or = ProcessOrOperator(node); diff --git a/icing/query/advanced_query_parser/query-visitor.h b/icing/query/advanced_query_parser/query-visitor.h index b4e3dd7..7498457 100644 --- a/icing/query/advanced_query_parser/query-visitor.h +++ b/icing/query/advanced_query_parser/query-visitor.h @@ -119,19 +119,21 @@ class QueryVisitor : public AbstractSyntaxTreeVisitor { match_type_(match_type), needs_term_frequency_info_(needs_term_frequency_info), pending_property_restricts_(std::move(pending_property_restricts)), - processing_not_(processing_not) { + processing_not_(processing_not), + expecting_numeric_arg_(false) { RegisterFunctions(); } bool has_pending_error() const { return !pending_error_.ok(); } - // Creates a DocHitInfoIterator reflecting the provided term. Also populates, + // Creates a DocHitInfoIterator reflecting the provided term and whether the + // prefix operator has been applied to this term. Also populates, // property_query_terms_map_ and query_term_iterators_ as appropriate. // Returns: // - On success, a DocHitInfoIterator for the provided term // - INVALID_ARGUMENT if unable to create an iterator for the term. libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> - CreateTermIterator(const std::string& term); + CreateTermIterator(QueryTerm term); // Processes the PendingValue at the top of pending_values_, parses it into a // int64_t and pops the top. @@ -143,15 +145,17 @@ class QueryVisitor : public AbstractSyntaxTreeVisitor { // Processes the PendingValue at the top of pending_values_ and pops the top. 
// Returns: - // - On success, the string value stored in the text at the top + // - On success, the string value stored in the text at the top and a bool + // indicating whether or not the string value has a prefix operator. // - INVALID_ARGUMENT if pending_values_ is empty or doesn't hold a string. - libtextclassifier3::StatusOr<std::string> PopPendingStringValue(); + libtextclassifier3::StatusOr<QueryTerm> PopPendingStringValue(); // Processes the PendingValue at the top of pending_values_ and pops the top. // Returns: // - On success, the string value stored in the text at the top + // indicating whether or not the string value has a prefix operator. // - INVALID_ARGUMENT if pending_values_ is empty or doesn't hold a text. - libtextclassifier3::StatusOr<std::string> PopPendingTextValue(); + libtextclassifier3::StatusOr<QueryTerm> PopPendingTextValue(); // Processes the PendingValue at the top of pending_values_ and pops the top. // Returns: @@ -171,15 +175,35 @@ class QueryVisitor : public AbstractSyntaxTreeVisitor { libtextclassifier3::StatusOr<std::vector<std::unique_ptr<DocHitInfoIterator>>> PopAllPendingIterators(); + // Processes the unary operator node as a NOT operator. A NOT can have an + // operator type of "NOT" or "MINUS" + // + // RETURNS: + // - OK on success + // - INVALID_ARGUMENT if any errors are encountered while processing + // node->child + libtextclassifier3::Status ProcessNotOperator(const UnaryOperatorNode* node); + + // Processes the unary operator node as a negation operator. A negation + // operator should have an operator of type "MINUS" and it's children must + // resolve to a numeric value. + // + // RETURNS: + // - OK on success + // - INVALID_ARGUMENT if the node->child can't be resolved to a numeric + // value. + libtextclassifier3::Status ProcessNegationOperator( + const UnaryOperatorNode* node); + // Processes the NumericComparator represented by node. This must be called // *after* this node's children have been visited. 
The PendingValues added by // this node's children will be consumed by this function and the PendingValue // for this node will be returned. // Returns: - // - On success, then PendingValue representing this node and it's children. + // - On success, OK // - INVALID_ARGUMENT if unable to retrieve string value or int value // - NOT_FOUND if there is no entry in the numeric index for the property - libtextclassifier3::StatusOr<PendingValue> ProcessNumericComparator( + libtextclassifier3::Status ProcessNumericComparator( const NaryOperatorNode* node); // Processes the AND and OR operators represented by the node. This must be @@ -230,6 +254,12 @@ class QueryVisitor : public AbstractSyntaxTreeVisitor { // children cannot be resolved to a MEMBER or an iterator respectively. libtextclassifier3::Status ProcessHasOperator(const NaryOperatorNode* node); + // Returns the correct match type to apply based on both the match type and + // whether the prefix operator is currently present. + TermMatchType::Code GetTermMatchType(bool is_prefix) const { + return (is_prefix) ? TermMatchType::PREFIX : match_type_; + } + std::stack<PendingValue> pending_values_; libtextclassifier3::Status pending_error_; @@ -259,6 +289,10 @@ class QueryVisitor : public AbstractSyntaxTreeVisitor { // The stack of property restricts currently being processed by the visitor. PendingPropertyRestricts pending_property_restricts_; bool processing_not_; + + // Whether we are in the midst of processing a subtree that is expected to + // resolve to a numeric argument. 
+ bool expecting_numeric_arg_; }; } // namespace lib diff --git a/icing/query/advanced_query_parser/query-visitor_test.cc b/icing/query/advanced_query_parser/query-visitor_test.cc index 12a6631..033e86b 100644 --- a/icing/query/advanced_query_parser/query-visitor_test.cc +++ b/icing/query/advanced_query_parser/query-visitor_test.cc @@ -389,8 +389,7 @@ TEST_P(QueryVisitorTest, SimpleGreaterThan) { ElementsAre(kDocumentId2)); } -// TODO(b/208654892) Properly handle negative numbers in query expressions. -TEST_P(QueryVisitorTest, DISABLED_IntMinLessThanEqual) { +TEST_P(QueryVisitorTest, IntMinLessThanEqual) { // Setup the numeric index with docs 0, 1 and 2 holding the values INT_MIN, // INT_MAX and INT_MIN + 1 respectively. int64_t int_min = std::numeric_limits<int64_t>::min(); @@ -648,8 +647,7 @@ TEST_P(QueryVisitorTest, NeverVisitedReturnsInvalid) { StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -// TODO(b/208654892) Properly handle negative numbers in query expressions. -TEST_P(QueryVisitorTest, DISABLED_IntMinLessThanInvalid) { +TEST_P(QueryVisitorTest, IntMinLessThanInvalid) { // Setup the numeric index with docs 0, 1 and 2 holding the values INT_MIN, // INT_MAX and INT_MIN + 1 respectively. 
int64_t int_min = std::numeric_limits<int64_t>::min(); @@ -724,6 +722,74 @@ TEST_P(QueryVisitorTest, NumericComparisonPropertyStringIsInvalid) { StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } +TEST_P(QueryVisitorTest, NumericComparatorDoesntAffectLaterTerms) { + ICING_ASSERT_OK(schema_store_->SetSchema( + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("type")) + .Build())); + + // Index three documents: + // - Doc0: ["-2", "-1", "1", "2"] and [-2, -1, 1, 2] + // - Doc1: [-1] + // - Doc2: ["2"] and [-1] + ICING_ASSERT_OK(document_store_->Put( + DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build())); + std::unique_ptr<NumericIndex<int64_t>::Editor> editor = + numeric_index_->Edit("price", kDocumentId0, kSectionId0); + editor->BufferKey(-2); + editor->BufferKey(-1); + editor->BufferKey(1); + editor->BufferKey(2); + std::move(*editor).IndexAllBufferedKeys(); + Index::Editor term_editor = index_->Edit( + kDocumentId0, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0); + term_editor.BufferTerm("-2"); + term_editor.BufferTerm("-1"); + term_editor.BufferTerm("1"); + term_editor.BufferTerm("2"); + term_editor.IndexAllBufferedTerms(); + + ICING_ASSERT_OK(document_store_->Put( + DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build())); + editor = numeric_index_->Edit("price", kDocumentId1, kSectionId0); + editor->BufferKey(-1); + std::move(*editor).IndexAllBufferedKeys(); + + ICING_ASSERT_OK(document_store_->Put( + DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build())); + editor = numeric_index_->Edit("price", kDocumentId2, kSectionId0); + editor->BufferKey(-1); + std::move(*editor).IndexAllBufferedKeys(); + term_editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX, + /*namespace_id=*/0); + term_editor.BufferTerm("2"); + term_editor.IndexAllBufferedTerms(); + + // Translating MINUS chars that are interpreted as NOTs, this query would be + // `price == -1 AND NOT 2` + // All documents should match 
`price == -1` + // Both docs 0 and 2 should be excluded because of the `NOT 2` clause + // doc0 has both a text and number entry for `-2`, neither of which should + // match. + std::string query = CreateQuery("price == -1 -2"); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, + ParseQueryHelper(query)); + QueryVisitor query_visitor( + index_.get(), numeric_index_.get(), document_store_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), + DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, + /*needs_term_frequency_info_=*/true); + root_node->Accept(&query_visitor); + ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, + std::move(query_visitor).ConsumeResults()); + EXPECT_THAT(query_results.features_in_use, + ElementsAre(kNumericSearchFeature)); + EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), IsEmpty()); + EXPECT_THAT(query_results.query_terms, IsEmpty()); + EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()), + ElementsAre(kDocumentId1)); +} + TEST_P(QueryVisitorTest, SingleTermTermFrequencyEnabled) { // Setup the index with docs 0, 1 and 2 holding the values "foo", "foo" and // "bar" respectively. @@ -827,6 +893,169 @@ TEST_P(QueryVisitorTest, SingleTermTermFrequencyDisabled) { StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); } +TEST_P(QueryVisitorTest, SingleTermPrefix) { + // Setup the index with docs 0, 1 and 2 holding the values "foo", "foo" and + // "bar" respectively. 
+ Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1, + TERM_MATCH_PREFIX, /*namespace_id=*/0); + editor.BufferTerm("foo"); + editor.IndexAllBufferedTerms(); + + editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX, + /*namespace_id=*/0); + editor.BufferTerm("foo"); + editor.IndexAllBufferedTerms(); + + editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX, + /*namespace_id=*/0); + editor.BufferTerm("bar"); + editor.IndexAllBufferedTerms(); + + // An EXACT query for 'fo' won't match anything. + std::string query = CreateQuery("fo"); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, + ParseQueryHelper(query)); + QueryVisitor query_visitor( + index_.get(), numeric_index_.get(), document_store_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), + DocHitInfoIteratorFilter::Options(), TERM_MATCH_EXACT, + /*needs_term_frequency_info_=*/true); + root_node->Accept(&query_visitor); + ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, + std::move(query_visitor).ConsumeResults()); + EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("")); + EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre("fo")); + EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), + UnorderedElementsAre("fo")); + EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()), IsEmpty()); + + query = CreateQuery("fo*"); + ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query)); + QueryVisitor query_visitor_two( + index_.get(), numeric_index_.get(), document_store_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), + DocHitInfoIteratorFilter::Options(), TERM_MATCH_EXACT, + /*needs_term_frequency_info_=*/true); + root_node->Accept(&query_visitor_two); + ICING_ASSERT_OK_AND_ASSIGN(query_results, + std::move(query_visitor_two).ConsumeResults()); + EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("")); + EXPECT_THAT(query_results.query_terms[""], 
UnorderedElementsAre("fo")); + EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), + UnorderedElementsAre("fo")); + EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()), + ElementsAre(kDocumentId1, kDocumentId0)); +} + +TEST_P(QueryVisitorTest, PrefixOperatorAfterPropertyReturnsInvalid) { + std::string query = "price* < 2"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, + ParseQueryHelper(query)); + QueryVisitor query_visitor( + index_.get(), numeric_index_.get(), document_store_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), + DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, + /*needs_term_frequency_info_=*/true); + root_node->Accept(&query_visitor); + EXPECT_THAT(std::move(query_visitor).ConsumeResults(), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +TEST_P(QueryVisitorTest, PrefixOperatorAfterNumericValueReturnsInvalid) { + std::string query = "price < 2*"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, + ParseQueryHelper(query)); + QueryVisitor query_visitor( + index_.get(), numeric_index_.get(), document_store_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), + DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, + /*needs_term_frequency_info_=*/true); + root_node->Accept(&query_visitor); + EXPECT_THAT(std::move(query_visitor).ConsumeResults(), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +TEST_P(QueryVisitorTest, PrefixOperatorAfterPropertyRestrictReturnsInvalid) { + std::string query = "subject*:foo"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, + ParseQueryHelper(query)); + QueryVisitor query_visitor( + index_.get(), numeric_index_.get(), document_store_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), + DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, + /*needs_term_frequency_info_=*/true); + root_node->Accept(&query_visitor); + 
EXPECT_THAT(std::move(query_visitor).ConsumeResults(), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +TEST_P(QueryVisitorTest, SegmentationWithPrefix) { + // Setup the index with docs 0, 1 and 2 holding the values ["foo", "ba"], + // ["foo", "ba"] and ["bar", "fo"] respectively. + Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1, + TERM_MATCH_PREFIX, /*namespace_id=*/0); + editor.BufferTerm("foo"); + editor.BufferTerm("ba"); + editor.IndexAllBufferedTerms(); + + editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX, + /*namespace_id=*/0); + editor.BufferTerm("foo"); + editor.BufferTerm("ba"); + editor.IndexAllBufferedTerms(); + + editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX, + /*namespace_id=*/0); + editor.BufferTerm("bar"); + editor.BufferTerm("fo"); + editor.IndexAllBufferedTerms(); + + // An EXACT query for `ba?fo` will be lexed into a single TEXT token. + // The visitor will tokenize it into `ba` and `fo` (`?` is dropped because it + // is punctuation). Each document will match one and only one of these exact + // tokens. Therefore, nothing will match this query. 
+ std::string query = CreateQuery("ba?fo"); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, + ParseQueryHelper(query)); + QueryVisitor query_visitor( + index_.get(), numeric_index_.get(), document_store_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), + DocHitInfoIteratorFilter::Options(), TERM_MATCH_EXACT, + /*needs_term_frequency_info_=*/true); + root_node->Accept(&query_visitor); + ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, + std::move(query_visitor).ConsumeResults()); + EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("")); + EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre("ba", "fo")); + EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), + UnorderedElementsAre("ba", "fo")); + EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()), IsEmpty()); + + // An EXACT query for `ba?fo*` will be lexed into a TEXT token and a TIMES + // token. + // The visitor will tokenize the TEXT into `ba` and `fo` (`?` is dropped + // because it is punctuation). The prefix operator should only apply to the + // final token `fo`. This will cause matches with docs 0 and 1 which contain + // "ba" and "foo". doc2 will not match because "ba" does not exactly match + // either "bar" or "fo". 
+ query = CreateQuery("ba?fo*"); + ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query)); + QueryVisitor query_visitor_two( + index_.get(), numeric_index_.get(), document_store_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), + DocHitInfoIteratorFilter::Options(), TERM_MATCH_EXACT, + /*needs_term_frequency_info_=*/true); + root_node->Accept(&query_visitor_two); + ICING_ASSERT_OK_AND_ASSIGN(query_results, + std::move(query_visitor_two).ConsumeResults()); + EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("")); + EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre("ba", "fo")); + EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), + UnorderedElementsAre("ba", "fo")); + EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()), + ElementsAre(kDocumentId1, kDocumentId0)); +} + TEST_P(QueryVisitorTest, SingleVerbatimTerm) { // Setup the index with docs 0, 1 and 2 holding the values "foo:bar(baz)", // "foo:bar(baz)" and "bar:baz(foo)" respectively. @@ -867,6 +1096,46 @@ TEST_P(QueryVisitorTest, SingleVerbatimTerm) { ElementsAre(kDocumentId1, kDocumentId0)); } +TEST_P(QueryVisitorTest, SingleVerbatimTermPrefix) { + // Setup the index with docs 0, 1 and 2 holding the values "foo:bar(baz)", + // "foo:bar(abc)" and "bar:baz(foo)" respectively. + Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1, + TERM_MATCH_PREFIX, /*namespace_id=*/0); + editor.BufferTerm("foo:bar(baz)"); + editor.IndexAllBufferedTerms(); + + editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX, + /*namespace_id=*/0); + editor.BufferTerm("foo:bar(abc)"); + editor.IndexAllBufferedTerms(); + + editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX, + /*namespace_id=*/0); + editor.BufferTerm("bar:baz(foo)"); + editor.IndexAllBufferedTerms(); + + // Query for `"foo:bar("*`. This should match docs 0 and 1. 
+ std::string query = CreateQuery("\"foo:bar(\"*"); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node, + ParseQueryHelper(query)); + QueryVisitor query_visitor( + index_.get(), numeric_index_.get(), document_store_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), + DocHitInfoIteratorFilter::Options(), TERM_MATCH_EXACT, + /*needs_term_frequency_info_=*/true); + root_node->Accept(&query_visitor); + ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, + std::move(query_visitor).ConsumeResults()); + EXPECT_THAT(query_results.features_in_use, + ElementsAre(kVerbatimSearchFeature)); + EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("")); + EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre("foo:bar(")); + EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), + UnorderedElementsAre("foo:bar(")); + EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()), + ElementsAre(kDocumentId1, kDocumentId0)); +} + // There are three primary cases to worry about for escaping: // // NOTE: The following comments use ` chars to denote the beginning and end of diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc index 2a7e108..35ee172 100644 --- a/icing/store/document-store.cc +++ b/icing/store/document-store.cc @@ -58,6 +58,7 @@ #include "icing/util/clock.h" #include "icing/util/crc32.h" #include "icing/util/data-loss.h" +#include "icing/util/encode-util.h" #include "icing/util/fingerprint-util.h" #include "icing/util/logging.h" #include "icing/util/status-macros.h" @@ -88,6 +89,17 @@ constexpr int32_t kUriMapperMaxSize = 36 * 1024 * 1024; // 36 MiB constexpr int32_t kNamespaceMapperMaxSize = 3 * 128 * 1024; // 384 KiB constexpr int32_t kCorpusMapperMaxSize = 3 * 128 * 1024; // 384 KiB +// Whether to use namespace id or namespace name to build up fingerprint for +// document_key_mapper_ and corpus_mapper_. 
+// Note: Changing this flag will require a reconstruction of the internal +// mappers in the document store. A easy way to trigger a rebuild is to change +// the kMagic value. +// +// TODO(b/259969017) Flip this flag to true at the time when we switch to use +// persistent hash map for document_key_mapper_ so that we just need one +// reconstruction of the internal mappers. +constexpr bool kNamespaceIdFingerprint = false; + DocumentWrapper CreateDocumentWrapper(DocumentProto&& document) { DocumentWrapper document_wrapper; *document_wrapper.mutable_document() = std::move(document); @@ -126,18 +138,40 @@ std::string MakeCorpusMapperFilename(const std::string& base_dir) { return absl_ports::StrCat(base_dir, "/", kCorpusIdMapperFilename); } -// TODO(adorokhine): This class internally uses an 8-byte fingerprint of the -// Key and stores the key/value in a file-backed-trie that adds an ~80 byte -// overhead per key. As we know that these fingerprints are always 8-bytes in -// length and that they're random, we might be able to store them more -// compactly. -std::string MakeFingerprint(std::string_view field1, std::string_view field2) { - // Using a 64-bit fingerprint to represent the key could lead to collisions. - // But, even with 200K unique keys, the probability of collision is about - // one-in-a-billion (https://en.wikipedia.org/wiki/Birthday_attack). - uint64_t fprint = - tc3farmhash::Fingerprint64(absl_ports::StrCat(field1, field2)); - return fingerprint_util::GetFingerprintString(fprint); +// This function will encode a namespace id into a fixed 3 bytes string. +std::string EncodeNamespaceId(NamespaceId namespace_id) { + // encoding should be 1 to 3 bytes based on the value of namespace_id. + std::string encoding = encode_util::EncodeIntToCString(namespace_id); + // Make encoding to fixed 3 bytes. 
+ while (encoding.size() < 3) { + // DynamicTrie cannot handle keys with 0 as bytes, so we append it using 1, + // just like what we do in encode_util::EncodeIntToCString. + // + // The reason that this works is because DecodeIntToString decodes a byte + // value of 0x01 as 0x00. When EncodeIntToCString returns a namespaceid + // encoding that is less than 3 bytes, it means that the id contains + // unencoded leading 0x00. So here we're explicitly encoding those bytes as + // 0x01. + encoding.push_back(1); + } + return encoding; +} + +std::string MakeFingerprint(NamespaceId namespace_id, + std::string_view namespace_, + std::string_view uri_or_schema) { + if (!kNamespaceIdFingerprint) { + // Using a 64-bit fingerprint to represent the key could lead to collisions. + // But, even with 200K unique keys, the probability of collision is about + // one-in-a-billion (https://en.wikipedia.org/wiki/Birthday_attack). + uint64_t fprint = tc3farmhash::Fingerprint64( + absl_ports::StrCat(namespace_, uri_or_schema)); + return fingerprint_util::GetFingerprintString(fprint); + } else { + return absl_ports::StrCat(EncodeNamespaceId(namespace_id), + encode_util::EncodeIntToCString( + tc3farmhash::Fingerprint64(uri_or_schema))); + } } int64_t CalculateExpirationTimestampMs(int64_t creation_timestamp_ms, @@ -489,10 +523,16 @@ libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles( continue; } } + + ICING_ASSIGN_OR_RETURN( + NamespaceId namespace_id, + namespace_mapper_->GetOrPut(document_wrapper.document().namespace_(), + namespace_mapper_->num_keys())); + // Updates key mapper and document_id mapper with the new document DocumentId new_document_id = document_id_mapper_->num_elements(); ICING_RETURN_IF_ERROR(document_key_mapper_->Put( - MakeFingerprint(document_wrapper.document().namespace_(), + MakeFingerprint(namespace_id, document_wrapper.document().namespace_(), document_wrapper.document().uri()), new_document_id)); ICING_RETURN_IF_ERROR( @@ -517,14 +557,9 @@ 
libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles( schema_type_id = schema_type_id_or.ValueOrDie(); } - ICING_ASSIGN_OR_RETURN( - NamespaceId namespace_id, - namespace_mapper_->GetOrPut(document_wrapper.document().namespace_(), - namespace_mapper_->num_keys())); - // Update corpus maps std::string corpus = - MakeFingerprint(document_wrapper.document().namespace_(), + MakeFingerprint(namespace_id, document_wrapper.document().namespace_(), document_wrapper.document().schema()); ICING_ASSIGN_OR_RETURN( CorpusId corpusId, @@ -892,20 +927,21 @@ libtextclassifier3::StatusOr<DocumentId> DocumentStore::InternalPut( "some space."); } - ICING_RETURN_IF_ERROR(document_key_mapper_->Put( - MakeFingerprint(name_space, uri), new_document_id)); - ICING_RETURN_IF_ERROR(document_id_mapper_->Set(new_document_id, file_offset)); - // Update namespace maps ICING_ASSIGN_OR_RETURN( NamespaceId namespace_id, namespace_mapper_->GetOrPut(name_space, namespace_mapper_->num_keys())); + // Updates key mapper and document_id mapper + ICING_RETURN_IF_ERROR(document_key_mapper_->Put( + MakeFingerprint(namespace_id, name_space, uri), new_document_id)); + ICING_RETURN_IF_ERROR(document_id_mapper_->Set(new_document_id, file_offset)); + // Update corpus maps - ICING_ASSIGN_OR_RETURN( - CorpusId corpusId, - corpus_mapper_->GetOrPut(MakeFingerprint(name_space, schema), - corpus_mapper_->num_keys())); + ICING_ASSIGN_OR_RETURN(CorpusId corpusId, + corpus_mapper_->GetOrPut( + MakeFingerprint(namespace_id, name_space, schema), + corpus_mapper_->num_keys())); ICING_ASSIGN_OR_RETURN(CorpusAssociatedScoreData scoring_data, GetCorpusAssociatedScoreDataToUpdate(corpusId)); @@ -1023,17 +1059,21 @@ libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get( libtextclassifier3::StatusOr<DocumentId> DocumentStore::GetDocumentId( const std::string_view name_space, const std::string_view uri) const { - auto document_id_or = - document_key_mapper_->Get(MakeFingerprint(name_space, uri)); - if 
(!document_id_or.ok()) { - return absl_ports::Annotate( - document_id_or.status(), - absl_ports::StrCat("Failed to find DocumentId by key: ", name_space, - ", ", uri)); + auto namespace_id_or = namespace_mapper_->Get(name_space); + libtextclassifier3::Status status = namespace_id_or.status(); + if (status.ok()) { + NamespaceId namespace_id = namespace_id_or.ValueOrDie(); + auto document_id_or = document_key_mapper_->Get( + MakeFingerprint(namespace_id, name_space, uri)); + status = document_id_or.status(); + if (status.ok()) { + // Guaranteed to have a DocumentId now + return document_id_or.ValueOrDie(); + } } - - // Guaranteed to have a DocumentId now - return document_id_or.ValueOrDie(); + return absl_ports::Annotate( + status, absl_ports::StrCat( + "Failed to find DocumentId by key: ", name_space, ", ", uri)); } std::vector<std::string> DocumentStore::GetAllNamespaces() const { @@ -1158,7 +1198,9 @@ libtextclassifier3::StatusOr<NamespaceId> DocumentStore::GetNamespaceId( libtextclassifier3::StatusOr<CorpusId> DocumentStore::GetCorpusId( const std::string_view name_space, const std::string_view schema) const { - return corpus_mapper_->Get(MakeFingerprint(name_space, schema)); + ICING_ASSIGN_OR_RETURN(NamespaceId namespace_id, + namespace_mapper_->Get(name_space)); + return corpus_mapper_->Get(MakeFingerprint(namespace_id, name_space, schema)); } libtextclassifier3::StatusOr<int32_t> DocumentStore::GetResultGroupingEntryId( diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt index 2dc1352..5838a7b 100644 --- a/synced_AOSP_CL_number.txt +++ b/synced_AOSP_CL_number.txt @@ -1 +1 @@ -set(synced_AOSP_CL_number=513153289) +set(synced_AOSP_CL_number=513864120) |