aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTerry Wang <tytytyww@google.com>2023-03-03 10:43:50 -0800
committerTerry Wang <tytytyww@google.com>2023-03-03 10:45:16 -0800
commit49064c458678781fbf3db256751658728dc87740 (patch)
tree76f82b7a029bb630ee916e1eb9cb14afcdc8f84d
parente103b8ea56212b2a5abc082ce888843f19c7d567 (diff)
downloadicing-49064c458678781fbf3db256751658728dc87740.tar.gz
Update Icing from upstream.
Descriptions: ====================================================================== Modify the lexer to consider internal minus characters to be a part of TEXT segments (as hyphens). ====================================================================== Modify the query language to support an individual prefix operator '*'. ====================================================================== Fix the fingerprint issue for document key mapper in document store ====================================================================== Resolve issues around the handling of negative numbers. ====================================================================== Fix doc join info compiler warning/error ====================================================================== Codifies how colons are tokenized by the lexer Bug: 208654892 Bug: 268680462 Change-Id: I67169b2e656eeafb6500b5b03a6c46c3cc9c3d66
-rw-r--r--icing/icing-search-engine_schema_test.cc6
-rw-r--r--icing/icing-search-engine_search_test.cc6
-rw-r--r--icing/join/doc-join-info.cc7
-rw-r--r--icing/query/advanced_query_parser/abstract-syntax-tree.h20
-rw-r--r--icing/query/advanced_query_parser/function_test.cc72
-rw-r--r--icing/query/advanced_query_parser/lexer.cc29
-rw-r--r--icing/query/advanced_query_parser/lexer.h5
-rw-r--r--icing/query/advanced_query_parser/lexer_test.cc82
-rw-r--r--icing/query/advanced_query_parser/parser.cc71
-rw-r--r--icing/query/advanced_query_parser/parser.h3
-rw-r--r--icing/query/advanced_query_parser/parser_integration_test.cc6
-rw-r--r--icing/query/advanced_query_parser/parser_test.cc6
-rw-r--r--icing/query/advanced_query_parser/pending-value.cc13
-rw-r--r--icing/query/advanced_query_parser/pending-value.h39
-rw-r--r--icing/query/advanced_query_parser/query-visitor.cc348
-rw-r--r--icing/query/advanced_query_parser/query-visitor.h50
-rw-r--r--icing/query/advanced_query_parser/query-visitor_test.cc277
-rw-r--r--icing/store/document-store.cc118
-rw-r--r--synced_AOSP_CL_number.txt2
19 files changed, 888 insertions, 272 deletions
diff --git a/icing/icing-search-engine_schema_test.cc b/icing/icing-search-engine_schema_test.cc
index 59d25e5..38a0464 100644
--- a/icing/icing-search-engine_schema_test.cc
+++ b/icing/icing-search-engine_schema_test.cc
@@ -1987,6 +1987,8 @@ TEST_F(IcingSearchEngineSchemaTest, IcingShouldWorkFor64Sections) {
ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
ASSERT_THAT(icing.Put(email_collection).status(), ProtoIsOk());
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
const std::vector<std::string> query_terms = {
"first1", "last2", "email3@gmail.com", "000-000-001",
"body", "subject", "2022-08-02", "3\\:00"};
@@ -1995,8 +1997,6 @@ TEST_F(IcingSearchEngineSchemaTest, IcingShouldWorkFor64Sections) {
*expected_document.mutable_results()->Add()->mutable_document() =
email_collection;
for (const std::string& query_term : query_terms) {
- SearchSpecProto search_spec;
- search_spec.set_term_match_type(TermMatchType::PREFIX);
search_spec.set_query(query_term);
SearchResultProto actual_results =
icing.Search(search_spec, GetDefaultScoringSpec(),
@@ -2005,8 +2005,6 @@ TEST_F(IcingSearchEngineSchemaTest, IcingShouldWorkFor64Sections) {
EqualsSearchResultIgnoreStatsAndScores(expected_document));
}
- SearchSpecProto search_spec;
- search_spec.set_term_match_type(TermMatchType::PREFIX);
search_spec.set_query("foo");
SearchResultProto expected_no_documents;
expected_no_documents.mutable_status()->set_code(StatusProto::OK);
diff --git a/icing/icing-search-engine_search_test.cc b/icing/icing-search-engine_search_test.cc
index 3c32253..5648184 100644
--- a/icing/icing-search-engine_search_test.cc
+++ b/icing/icing-search-engine_search_test.cc
@@ -3002,12 +3002,6 @@ TEST_P(IcingSearchEngineSearchTest, SnippetSectionRestrict) {
}
TEST_P(IcingSearchEngineSearchTest, Hyphens) {
- // TODO(b/208654892): Fix issues with minus/hyphen chars.
- if (GetParam() ==
- SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY) {
- GTEST_SKIP()
- << "Advanced query doesn't properly support hyphens at this time.";
- }
IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
diff --git a/icing/join/doc-join-info.cc b/icing/join/doc-join-info.cc
index 9bef08a..3b06f01 100644
--- a/icing/join/doc-join-info.cc
+++ b/icing/join/doc-join-info.cc
@@ -25,13 +25,14 @@ namespace lib {
DocJoinInfo::DocJoinInfo(DocumentId document_id,
JoinablePropertyId joinable_property_id) {
- value_ = 0;
+ Value temp_value = 0;
bit_util::BitfieldSet(/*new_value=*/document_id,
/*lsb_offset=*/kJoinablePropertyIdBits,
- /*len=*/kDocumentIdBits, &value_);
+ /*len=*/kDocumentIdBits, &temp_value);
bit_util::BitfieldSet(/*new_value=*/joinable_property_id,
/*lsb_offset=*/0,
- /*len=*/kJoinablePropertyIdBits, &value_);
+ /*len=*/kJoinablePropertyIdBits, &temp_value);
+ value_ = temp_value;
}
DocumentId DocJoinInfo::document_id() const {
diff --git a/icing/query/advanced_query_parser/abstract-syntax-tree.h b/icing/query/advanced_query_parser/abstract-syntax-tree.h
index dc28ab6..d18f6ea 100644
--- a/icing/query/advanced_query_parser/abstract-syntax-tree.h
+++ b/icing/query/advanced_query_parser/abstract-syntax-tree.h
@@ -52,18 +52,24 @@ class Node {
class TerminalNode : public Node {
public:
- explicit TerminalNode(std::string value) : value_(std::move(value)) {}
+ explicit TerminalNode(std::string value, bool is_prefix)
+ : value_(std::move(value)), is_prefix_(is_prefix) {}
- const std::string& value() const { return value_; }
+ const std::string& value() const& { return value_; }
+ std::string value() && { return std::move(value_); }
+
+ bool is_prefix() const { return is_prefix_; }
private:
std::string value_;
+ bool is_prefix_;
};
class FunctionNameNode : public TerminalNode {
public:
explicit FunctionNameNode(std::string value)
- : TerminalNode(std::move(value)) {}
+ : TerminalNode(std::move(value), /*is_prefix=*/false) {}
+
void Accept(AbstractSyntaxTreeVisitor* visitor) const override {
visitor->VisitFunctionName(this);
}
@@ -71,7 +77,9 @@ class FunctionNameNode : public TerminalNode {
class StringNode : public TerminalNode {
public:
- explicit StringNode(std::string value) : TerminalNode(std::move(value)) {}
+ explicit StringNode(std::string value, bool is_prefix = false)
+ : TerminalNode(std::move(value), is_prefix) {}
+
void Accept(AbstractSyntaxTreeVisitor* visitor) const override {
visitor->VisitString(this);
}
@@ -79,7 +87,9 @@ class StringNode : public TerminalNode {
class TextNode : public TerminalNode {
public:
- explicit TextNode(std::string value) : TerminalNode(std::move(value)) {}
+ explicit TextNode(std::string value, bool is_prefix = false)
+ : TerminalNode(std::move(value), is_prefix) {}
+
void Accept(AbstractSyntaxTreeVisitor* visitor) const override {
visitor->VisitText(this);
}
diff --git a/icing/query/advanced_query_parser/function_test.cc b/icing/query/advanced_query_parser/function_test.cc
index f9aaed6..3b3ca40 100644
--- a/icing/query/advanced_query_parser/function_test.cc
+++ b/icing/query/advanced_query_parser/function_test.cc
@@ -65,7 +65,8 @@ TEST(FunctionTest, ParamNotWrongTypeFails) {
/*params=*/{Param(DataType::kString)}, TrivialEval()));
// foo(bar)
std::vector<PendingValue> args;
- args.push_back(PendingValue::CreateTextPendingValue("bar"));
+ args.push_back(PendingValue::CreateTextPendingValue(
+ QueryTerm{"bar", /*is_prefix_val=*/false}));
EXPECT_THAT(function.Eval(std::move(args)),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
@@ -78,7 +79,8 @@ TEST(FunctionTest, ParamRequiredArgSucceeds) {
// foo("bar")
std::vector<PendingValue> args;
- args.push_back(PendingValue::CreateStringPendingValue("bar"));
+ args.push_back(PendingValue::CreateStringPendingValue(
+ QueryTerm{"bar", /*is_prefix_val=*/false}));
ICING_ASSERT_OK_AND_ASSIGN(PendingValue val, function.Eval(std::move(args)));
EXPECT_THAT(val.is_placeholder(), IsTrue());
}
@@ -136,14 +138,17 @@ TEST(FunctionTest, MultipleArgsTrailingOptionalSucceeds) {
// foo("bar")
std::vector<PendingValue> args;
- args.push_back(PendingValue::CreateStringPendingValue("bar"));
+ args.push_back(PendingValue::CreateStringPendingValue(
+ QueryTerm{"bar", /*is_prefix_val=*/false}));
ICING_ASSERT_OK_AND_ASSIGN(PendingValue val, function.Eval(std::move(args)));
EXPECT_THAT(val.is_placeholder(), IsTrue());
// foo("bar", "baz")
args = std::vector<PendingValue>();
- args.push_back(PendingValue::CreateStringPendingValue("bar"));
- args.push_back(PendingValue::CreateStringPendingValue("baz"));
+ args.push_back(PendingValue::CreateStringPendingValue(
+ QueryTerm{"bar", /*is_prefix_val=*/false}));
+ args.push_back(PendingValue::CreateStringPendingValue(
+ QueryTerm{"baz", /*is_prefix_val=*/false}));
ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args)));
EXPECT_THAT(val.is_placeholder(), IsTrue());
}
@@ -159,22 +164,28 @@ TEST(FunctionTest, MultipleArgsTrailingVariableSucceeds) {
// foo("bar")
std::vector<PendingValue> args;
- args.push_back(PendingValue::CreateStringPendingValue("bar"));
+ args.push_back(PendingValue::CreateStringPendingValue(
+ QueryTerm{"bar", /*is_prefix_val=*/false}));
ICING_ASSERT_OK_AND_ASSIGN(PendingValue val, function.Eval(std::move(args)));
EXPECT_THAT(val.is_placeholder(), IsTrue());
// foo("bar", "baz")
args = std::vector<PendingValue>();
- args.push_back(PendingValue::CreateStringPendingValue("bar"));
- args.push_back(PendingValue::CreateStringPendingValue("baz"));
+ args.push_back(PendingValue::CreateStringPendingValue(
+ QueryTerm{"bar", /*is_prefix_val=*/false}));
+ args.push_back(PendingValue::CreateStringPendingValue(
+ QueryTerm{"baz", /*is_prefix_val=*/false}));
ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args)));
EXPECT_THAT(val.is_placeholder(), IsTrue());
// foo("bar", "baz", "bat")
args = std::vector<PendingValue>();
- args.push_back(PendingValue::CreateStringPendingValue("bar"));
- args.push_back(PendingValue::CreateStringPendingValue("baz"));
- args.push_back(PendingValue::CreateStringPendingValue("bat"));
+ args.push_back(PendingValue::CreateStringPendingValue(
+ QueryTerm{"bar", /*is_prefix_val=*/false}));
+ args.push_back(PendingValue::CreateStringPendingValue(
+ QueryTerm{"baz", /*is_prefix_val=*/false}));
+ args.push_back(PendingValue::CreateStringPendingValue(
+ QueryTerm{"bat", /*is_prefix_val=*/false}));
ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args)));
EXPECT_THAT(val.is_placeholder(), IsTrue());
}
@@ -205,20 +216,24 @@ TEST(FunctionTest, MultipleArgsOptionalBeforeOptionalSucceeds) {
// foo("bar")
args = std::vector<PendingValue>();
- args.push_back(PendingValue::CreateStringPendingValue("bar"));
+ args.push_back(PendingValue::CreateStringPendingValue(
+ QueryTerm{"bar", /*is_prefix_val=*/false}));
ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args)));
EXPECT_THAT(val.is_placeholder(), IsTrue());
// foo("bar", baz)
args = std::vector<PendingValue>();
- args.push_back(PendingValue::CreateStringPendingValue("bar"));
- args.push_back(PendingValue::CreateTextPendingValue("baz"));
+ args.push_back(PendingValue::CreateStringPendingValue(
+ QueryTerm{"bar", /*is_prefix_val=*/false}));
+ args.push_back(PendingValue::CreateTextPendingValue(
+ QueryTerm{"baz", /*is_prefix_val=*/false}));
ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args)));
EXPECT_THAT(val.is_placeholder(), IsTrue());
// foo(baz)
args = std::vector<PendingValue>();
- args.push_back(PendingValue::CreateTextPendingValue("baz"));
+ args.push_back(PendingValue::CreateTextPendingValue(
+ QueryTerm{"baz", /*is_prefix_val=*/false}));
EXPECT_THAT(function.Eval(std::move(args)),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
@@ -239,35 +254,44 @@ TEST(FunctionTest, MultipleArgsOptionalBeforeVariableSucceeds) {
// foo("bar")
args = std::vector<PendingValue>();
- args.push_back(PendingValue::CreateStringPendingValue("bar"));
+ args.push_back(PendingValue::CreateStringPendingValue(
+ QueryTerm{"bar", /*is_prefix_val=*/false}));
ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args)));
EXPECT_THAT(val.is_placeholder(), IsTrue());
// foo("bar", baz)
args = std::vector<PendingValue>();
- args.push_back(PendingValue::CreateStringPendingValue("bar"));
- args.push_back(PendingValue::CreateTextPendingValue("baz"));
+ args.push_back(PendingValue::CreateStringPendingValue(
+ QueryTerm{"bar", /*is_prefix_val=*/false}));
+ args.push_back(PendingValue::CreateTextPendingValue(
+ QueryTerm{"baz", /*is_prefix_val=*/false}));
ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args)));
EXPECT_THAT(val.is_placeholder(), IsTrue());
// foo("bar", baz, bat)
args = std::vector<PendingValue>();
- args.push_back(PendingValue::CreateStringPendingValue("bar"));
- args.push_back(PendingValue::CreateTextPendingValue("baz"));
- args.push_back(PendingValue::CreateTextPendingValue("bat"));
+ args.push_back(PendingValue::CreateStringPendingValue(
+ QueryTerm{"bar", /*is_prefix_val=*/false}));
+ args.push_back(PendingValue::CreateTextPendingValue(
+ QueryTerm{"baz", /*is_prefix_val=*/false}));
+ args.push_back(PendingValue::CreateTextPendingValue(
+ QueryTerm{"bat", /*is_prefix_val=*/false}));
ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args)));
EXPECT_THAT(val.is_placeholder(), IsTrue());
// foo(baz)
args = std::vector<PendingValue>();
- args.push_back(PendingValue::CreateTextPendingValue("baz"));
+ args.push_back(PendingValue::CreateTextPendingValue(
+ QueryTerm{"baz", /*is_prefix_val=*/false}));
EXPECT_THAT(function.Eval(std::move(args)),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
// foo(baz, bat)
args = std::vector<PendingValue>();
- args.push_back(PendingValue::CreateTextPendingValue("baz"));
- args.push_back(PendingValue::CreateTextPendingValue("bat"));
+ args.push_back(PendingValue::CreateTextPendingValue(
+ QueryTerm{"baz", /*is_prefix_val=*/false}));
+ args.push_back(PendingValue::CreateTextPendingValue(
+ QueryTerm{"bat", /*is_prefix_val=*/false}));
EXPECT_THAT(function.Eval(std::move(args)),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
diff --git a/icing/query/advanced_query_parser/lexer.cc b/icing/query/advanced_query_parser/lexer.cc
index 685f2e9..6cddd96 100644
--- a/icing/query/advanced_query_parser/lexer.cc
+++ b/icing/query/advanced_query_parser/lexer.cc
@@ -38,10 +38,24 @@ bool Lexer::ConsumeWhitespace() {
}
bool Lexer::ConsumeQuerySingleChar() {
- if (current_char_ != ':') {
- return false;
+ switch (current_char_) {
+ case ':':
+ tokens_.push_back({":", TokenType::COMPARATOR});
+ break;
+ case '*':
+ tokens_.push_back({"", TokenType::STAR});
+ break;
+ case '-':
+ if (in_text_) {
+ // MINUS ('-') is considered to be a part of a text segment if it is
+ // in the middle of a TEXT segment (ex. `foo-bar`).
+ return false;
+ }
+ tokens_.push_back({"", TokenType::MINUS});
+ break;
+ default:
+ return false;
}
- tokens_.push_back({":", TokenType::COMPARATOR});
Advance();
return true;
}
@@ -57,6 +71,9 @@ bool Lexer::ConsumeScoringSingleChar() {
case '/':
tokens_.push_back({"", TokenType::DIV});
break;
+ case '-':
+ tokens_.push_back({"", TokenType::MINUS});
+ break;
default:
return false;
}
@@ -72,9 +89,6 @@ bool Lexer::ConsumeGeneralSingleChar() {
case '.':
tokens_.push_back({"", TokenType::DOT});
break;
- case '-':
- tokens_.push_back({"", TokenType::MINUS});
- break;
case '(':
tokens_.push_back({"", TokenType::LPAREN});
break;
@@ -176,6 +190,7 @@ bool Lexer::Text() {
tokens_.push_back({"", TokenType::TEXT});
int token_index = tokens_.size() - 1;
while (!ConsumeNonText() && current_char_ != '\0') {
+ in_text_ = true;
// When getting a backslash in TEXT, unescape it by accepting its following
// character no matter which character it is, including white spaces,
// operator symbols, parentheses, etc.
@@ -194,6 +209,8 @@ bool Lexer::Text() {
// No need to break, since NonText() must be true at this point.
}
}
+ in_text_ = false;
+
if (language_ == Lexer::Language::QUERY) {
std::string &text = tokens_[token_index].text;
TokenType &type = tokens_[token_index].type;
diff --git a/icing/query/advanced_query_parser/lexer.h b/icing/query/advanced_query_parser/lexer.h
index a13ab60..f7f06dc 100644
--- a/icing/query/advanced_query_parser/lexer.h
+++ b/icing/query/advanced_query_parser/lexer.h
@@ -38,6 +38,7 @@ class Lexer {
DOT, // '.'
PLUS, // '+' Not allowed in QUERY language.
MINUS, // '-'
+ STAR, // '*' Not allowed in SCORING language.
TIMES, // '*' Not allowed in QUERY language.
DIV, // '/' Not allowed in QUERY language.
LPAREN, // '('
@@ -149,6 +150,10 @@ class Lexer {
int32_t current_index_ = -1;
char current_char_ = '\0';
std::vector<LexerToken> tokens_;
+
+ // Stores whether the lexer is currently inspecting a TEXT segment while
+ // handling current_char_.
+ bool in_text_ = false;
};
} // namespace lib
diff --git a/icing/query/advanced_query_parser/lexer_test.cc b/icing/query/advanced_query_parser/lexer_test.cc
index 86284fb..c6d215c 100644
--- a/icing/query/advanced_query_parser/lexer_test.cc
+++ b/icing/query/advanced_query_parser/lexer_test.cc
@@ -73,22 +73,26 @@ TEST(LexerTest, PrefixQuery) {
ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> tokens,
lexer->ExtractTokens());
EXPECT_THAT(tokens,
- ElementsAre(EqualsLexerToken("foo*", Lexer::TokenType::TEXT)));
+ ElementsAre(EqualsLexerToken("foo", Lexer::TokenType::TEXT),
+ EqualsLexerToken("", Lexer::TokenType::STAR)));
lexer = std::make_unique<Lexer>("fooAND*", Lexer::Language::QUERY);
ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
EXPECT_THAT(tokens,
- ElementsAre(EqualsLexerToken("fooAND*", Lexer::TokenType::TEXT)));
+ ElementsAre(EqualsLexerToken("fooAND", Lexer::TokenType::TEXT),
+ EqualsLexerToken("", Lexer::TokenType::STAR)));
lexer = std::make_unique<Lexer>("*ORfoo", Lexer::Language::QUERY);
ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
EXPECT_THAT(tokens,
- ElementsAre(EqualsLexerToken("*ORfoo", Lexer::TokenType::TEXT)));
+ ElementsAre(EqualsLexerToken("", Lexer::TokenType::STAR),
+ EqualsLexerToken("ORfoo", Lexer::TokenType::TEXT)));
lexer = std::make_unique<Lexer>("fooANDbar*", Lexer::Language::QUERY);
ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
- EXPECT_THAT(tokens, ElementsAre(EqualsLexerToken("fooANDbar*",
- Lexer::TokenType::TEXT)));
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("fooANDbar", Lexer::TokenType::TEXT),
+ EqualsLexerToken("", Lexer::TokenType::STAR)));
}
TEST(LexerTest, SimpleStringQuery) {
@@ -296,7 +300,8 @@ TEST(LexerTest, ComplexQuery) {
EqualsLexerToken("sender", Lexer::TokenType::TEXT),
EqualsLexerToken(":", Lexer::TokenType::COMPARATOR),
EqualsLexerToken(Lexer::TokenType::LPAREN),
- EqualsLexerToken("foo*", Lexer::TokenType::TEXT),
+ EqualsLexerToken("foo", Lexer::TokenType::TEXT),
+ EqualsLexerToken("", Lexer::TokenType::STAR),
EqualsLexerToken(Lexer::TokenType::AND),
EqualsLexerToken("bar", Lexer::TokenType::TEXT),
EqualsLexerToken(Lexer::TokenType::OR),
@@ -376,14 +381,13 @@ TEST(LexerTest, CJKT) {
lexer = std::make_unique<Lexer>("ញុំ&&ដើរទៅ||ធ្វើការ-រាល់ថ្ងៃ",
Lexer::Language::QUERY);
ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
- EXPECT_THAT(tokens,
- ElementsAre(EqualsLexerToken("ញុំ", Lexer::TokenType::TEXT),
- EqualsLexerToken(Lexer::TokenType::AND),
- EqualsLexerToken("ដើរទៅ", Lexer::TokenType::TEXT),
- EqualsLexerToken(Lexer::TokenType::OR),
- EqualsLexerToken("ធ្វើការ", Lexer::TokenType::TEXT),
- EqualsLexerToken(Lexer::TokenType::MINUS),
- EqualsLexerToken("រាល់ថ្ងៃ", Lexer::TokenType::TEXT)));
+ EXPECT_THAT(
+ tokens,
+ ElementsAre(EqualsLexerToken("ញុំ", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::AND),
+ EqualsLexerToken("ដើរទៅ", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::OR),
+ EqualsLexerToken("ធ្វើការ-រាល់ថ្ងៃ", Lexer::TokenType::TEXT)));
lexer = std::make_unique<Lexer>(
"나는"
@@ -477,7 +481,9 @@ TEST(LexerTest, ScoringArithmetic) {
lexer = std::make_unique<Lexer>("1+2*3/4", Lexer::Language::QUERY);
ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
EXPECT_THAT(tokens,
- ElementsAre(EqualsLexerToken("1+2*3/4", Lexer::TokenType::TEXT)));
+ ElementsAre(EqualsLexerToken("1+2", Lexer::TokenType::TEXT),
+ EqualsLexerToken("", Lexer::TokenType::STAR),
+ EqualsLexerToken("3/4", Lexer::TokenType::TEXT)));
}
// Currently, in scoring language, the lexer will view these logic operators as
@@ -609,6 +615,52 @@ TEST(LexerTest, ComplexScoring) {
EqualsLexerToken(Lexer::TokenType::RPAREN)));
}
+// foo:bar:baz is considered an invalid query as proposed in
+// http://go/appsearch-advanced-query-impl-plan#bookmark=id.yoeyepokmbc5 ; this
+// ensures that the lexer consistently tokenizes colons independently.
+TEST(LexerTest, NoAmbiguousTokenizing) {
+ // This is an invalid query; the lexer doesn't treat `bar:baz` as one token.
+ std::unique_ptr<Lexer> lexer =
+ std::make_unique<Lexer>("foo:bar:baz", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> invalidQueryTokens,
+ lexer->ExtractTokens());
+ EXPECT_THAT(invalidQueryTokens,
+ ElementsAre(EqualsLexerToken("foo", Lexer::TokenType::TEXT),
+ EqualsLexerToken(":", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken("bar", Lexer::TokenType::TEXT),
+ EqualsLexerToken(":", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken("baz", Lexer::TokenType::TEXT)));
+
+ lexer = std::make_unique<Lexer>("foo:\"bar:baz\"", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> validQueryTokens,
+ lexer->ExtractTokens());
+ EXPECT_THAT(
+ validQueryTokens,
+ ElementsAre(EqualsLexerToken("foo", Lexer::TokenType::TEXT),
+ EqualsLexerToken(":", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken("bar:baz", Lexer::TokenType::STRING)));
+}
+
+TEST(LexerTest, WhiteSpacesDoNotAffectColonTokenization) {
+ std::unique_ptr<Lexer> lexer =
+ std::make_unique<Lexer>("a:b c : d e: f g :h", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> tokens,
+ lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("a", Lexer::TokenType::TEXT),
+ EqualsLexerToken(":", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken("b", Lexer::TokenType::TEXT),
+ EqualsLexerToken("c", Lexer::TokenType::TEXT),
+ EqualsLexerToken(":", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken("d", Lexer::TokenType::TEXT),
+ EqualsLexerToken("e", Lexer::TokenType::TEXT),
+ EqualsLexerToken(":", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken("f", Lexer::TokenType::TEXT),
+ EqualsLexerToken("g", Lexer::TokenType::TEXT),
+ EqualsLexerToken(":", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken("h", Lexer::TokenType::TEXT)));
+}
+
TEST(LexerTest, QueryShouldRejectTokensBeyondLimit) {
std::string query;
for (int i = 0; i < Lexer::kMaxNumTokens + 1; ++i) {
diff --git a/icing/query/advanced_query_parser/parser.cc b/icing/query/advanced_query_parser/parser.cc
index 086f038..0e4c78d 100644
--- a/icing/query/advanced_query_parser/parser.cc
+++ b/icing/query/advanced_query_parser/parser.cc
@@ -72,15 +72,24 @@ Parser::ConsumeFunctionName() {
return function_name_node;
}
+// stringElement
+// : STRING STAR?
libtextclassifier3::StatusOr<std::unique_ptr<StringNode>>
-Parser::ConsumeString() {
+Parser::ConsumeStringElement() {
if (!Match(Lexer::TokenType::STRING)) {
return absl_ports::InvalidArgumentError(
"Unable to consume token as STRING.");
}
- auto node = std::make_unique<StringNode>(std::move(current_token_->text));
+ std::string text = std::move(current_token_->text);
++current_token_;
- return node;
+
+ bool is_prefix = false;
+ if (Match(Lexer::TokenType::STAR)) {
+ is_prefix = true;
+ ++current_token_;
+ }
+
+ return std::make_unique<StringNode>(std::move(text), is_prefix);
}
libtextclassifier3::StatusOr<std::string> Parser::ConsumeComparator() {
@@ -95,25 +104,35 @@ libtextclassifier3::StatusOr<std::string> Parser::ConsumeComparator() {
// member
// : TEXT (DOT TEXT)* (DOT function)?
+// | TEXT STAR
// ;
libtextclassifier3::StatusOr<std::unique_ptr<MemberNode>>
Parser::ConsumeMember() {
ICING_ASSIGN_OR_RETURN(std::unique_ptr<TextNode> text_node, ConsumeText());
std::vector<std::unique_ptr<TextNode>> children;
- children.push_back(std::move(text_node));
-
- while (Match(Lexer::TokenType::DOT)) {
- Consume(Lexer::TokenType::DOT);
- if (MatchFunction()) {
- ICING_ASSIGN_OR_RETURN(std::unique_ptr<FunctionNode> function_node,
- ConsumeFunction());
- // Once a function is matched, we should exit the current rule based on
- // the grammar.
- return std::make_unique<MemberNode>(std::move(children),
- std::move(function_node));
- }
- ICING_ASSIGN_OR_RETURN(text_node, ConsumeText());
+
+ // Member could be either `TEXT (DOT TEXT)* (DOT function)?` or `TEXT STAR`
+ // at this point. So check for 'STAR' to differentiate the two cases.
+ if (Match(Lexer::TokenType::STAR)) {
+ Consume(Lexer::TokenType::STAR);
+ text_node = std::make_unique<TextNode>(std::move(*text_node).value(),
+ /*is_prefix=*/true);
+ children.push_back(std::move(text_node));
+ } else {
children.push_back(std::move(text_node));
+ while (Match(Lexer::TokenType::DOT)) {
+ Consume(Lexer::TokenType::DOT);
+ if (MatchFunction()) {
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<FunctionNode> function_node,
+ ConsumeFunction());
+ // Once a function is matched, we should exit the current rule based on
+ // the grammar.
+ return std::make_unique<MemberNode>(std::move(children),
+ std::move(function_node));
+ }
+ ICING_ASSIGN_OR_RETURN(text_node, ConsumeText());
+ children.push_back(std::move(text_node));
+ }
}
return std::make_unique<MemberNode>(std::move(children),
/*function=*/nullptr);
@@ -141,14 +160,14 @@ Parser::ConsumeFunction() {
}
// comparable
-// : STRING
+// : stringElement
// | member
// | function
// ;
libtextclassifier3::StatusOr<std::unique_ptr<Node>>
Parser::ConsumeComparable() {
if (Match(Lexer::TokenType::STRING)) {
- return ConsumeString();
+ return ConsumeStringElement();
} else if (MatchMember()) {
return ConsumeMember();
}
@@ -186,7 +205,7 @@ Parser::ConsumeArgs() {
}
// restriction
-// : comparable (COMPARATOR (comparable | composite))?
+// : comparable (COMPARATOR MINUS? (comparable | composite))?
// ;
// COMPARATOR will not be produced in Scoring Lexer.
libtextclassifier3::StatusOr<std::unique_ptr<Node>>
@@ -197,6 +216,12 @@ Parser::ConsumeRestriction() {
return comparable;
}
ICING_ASSIGN_OR_RETURN(std::string operator_text, ConsumeComparator());
+
+ bool has_minus = Match(Lexer::TokenType::MINUS);
+ if (has_minus) {
+ Consume(Lexer::TokenType::MINUS);
+ }
+
std::unique_ptr<Node> arg;
if (MatchComposite()) {
ICING_ASSIGN_OR_RETURN(arg, ConsumeComposite());
@@ -206,6 +231,11 @@ Parser::ConsumeRestriction() {
return absl_ports::InvalidArgumentError(
"ARG: must begin with LPAREN or FIRST(comparable)");
}
+
+ if (has_minus) {
+ arg = std::make_unique<UnaryOperatorNode>("MINUS", std::move(arg));
+ }
+
std::vector<std::unique_ptr<Node>> args;
args.push_back(std::move(comparable));
args.push_back(std::move(arg));
@@ -243,10 +273,11 @@ libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeTerm() {
} else {
if (Match(Lexer::TokenType::NOT)) {
Consume(Lexer::TokenType::NOT);
+ operator_text = "NOT";
} else {
Consume(Lexer::TokenType::MINUS);
+ operator_text = "MINUS";
}
- operator_text = "NOT";
}
ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> simple, ConsumeSimple());
return std::make_unique<UnaryOperatorNode>(operator_text, std::move(simple));
diff --git a/icing/query/advanced_query_parser/parser.h b/icing/query/advanced_query_parser/parser.h
index 330b8b9..a48c562 100644
--- a/icing/query/advanced_query_parser/parser.h
+++ b/icing/query/advanced_query_parser/parser.h
@@ -94,7 +94,8 @@ class Parser {
libtextclassifier3::StatusOr<std::unique_ptr<FunctionNameNode>>
ConsumeFunctionName();
- libtextclassifier3::StatusOr<std::unique_ptr<StringNode>> ConsumeString();
+ libtextclassifier3::StatusOr<std::unique_ptr<StringNode>>
+ ConsumeStringElement();
libtextclassifier3::StatusOr<std::string> ConsumeComparator();
diff --git a/icing/query/advanced_query_parser/parser_integration_test.cc b/icing/query/advanced_query_parser/parser_integration_test.cc
index 88cc079..fa1bd2e 100644
--- a/icing/query/advanced_query_parser/parser_integration_test.cc
+++ b/icing/query/advanced_query_parser/parser_integration_test.cc
@@ -189,7 +189,7 @@ TEST(ParserIntegrationTest, Minus) {
parser.ConsumeQuery());
// Expected AST:
- // NOT
+ // MINUS
// |
// member
// |
@@ -197,11 +197,11 @@ TEST(ParserIntegrationTest, Minus) {
SimpleVisitor visitor;
tree_root->Accept(&visitor);
// SimpleVisitor ordering
- // { text, member, NOT }
+ // { text, member, MINUS }
EXPECT_THAT(visitor.nodes(),
ElementsAre(EqualsNodeInfo("foo", NodeType::kText),
EqualsNodeInfo("", NodeType::kMember),
- EqualsNodeInfo("NOT", NodeType::kUnaryOperator)));
+ EqualsNodeInfo("MINUS", NodeType::kUnaryOperator)));
}
TEST(ParserIntegrationTest, Has) {
diff --git a/icing/query/advanced_query_parser/parser_test.cc b/icing/query/advanced_query_parser/parser_test.cc
index f997329..502dbd3 100644
--- a/icing/query/advanced_query_parser/parser_test.cc
+++ b/icing/query/advanced_query_parser/parser_test.cc
@@ -181,7 +181,7 @@ TEST(ParserTest, Minus) {
parser.ConsumeQuery());
// Expected AST:
- // NOT
+ // MINUS
// |
// member
// |
@@ -189,11 +189,11 @@ TEST(ParserTest, Minus) {
SimpleVisitor visitor;
tree_root->Accept(&visitor);
// SimpleVisitor ordering
- // { text, member, NOT }
+ // { text, member, MINUS }
EXPECT_THAT(visitor.nodes(),
ElementsAre(EqualsNodeInfo("foo", NodeType::kText),
EqualsNodeInfo("", NodeType::kMember),
- EqualsNodeInfo("NOT", NodeType::kUnaryOperator)));
+ EqualsNodeInfo("MINUS", NodeType::kUnaryOperator)));
}
TEST(ParserTest, Has) {
diff --git a/icing/query/advanced_query_parser/pending-value.cc b/icing/query/advanced_query_parser/pending-value.cc
index 306812d..903e12f 100644
--- a/icing/query/advanced_query_parser/pending-value.cc
+++ b/icing/query/advanced_query_parser/pending-value.cc
@@ -24,14 +24,19 @@ libtextclassifier3::Status PendingValue::ParseInt() {
} else if (data_type_ != DataType::kText) {
return absl_ports::InvalidArgumentError("Cannot parse value as LONG");
}
+ if (query_term_.is_prefix_val) {
+ return absl_ports::InvalidArgumentError(absl_ports::StrCat(
+ "Cannot use prefix operator '*' with numeric value: ",
+ query_term_.term));
+ }
char* value_end;
- long_val_ = std::strtoll(string_vals_.at(0).c_str(), &value_end, /*base=*/10);
- if (value_end != string_vals_.at(0).c_str() + string_vals_.at(0).length()) {
+ long_val_ = std::strtoll(query_term_.term.c_str(), &value_end, /*base=*/10);
+ if (value_end != query_term_.term.c_str() + query_term_.term.length()) {
return absl_ports::InvalidArgumentError(absl_ports::StrCat(
- "Unable to parse \"", string_vals_.at(0), "\" as number."));
+ "Unable to parse \"", query_term_.term, "\" as number."));
}
data_type_ = DataType::kLong;
- string_vals_.clear();
+ query_term_ = {"", false};
return libtextclassifier3::Status::OK;
}
diff --git a/icing/query/advanced_query_parser/pending-value.h b/icing/query/advanced_query_parser/pending-value.h
index 8a8704d..d18789d 100644
--- a/icing/query/advanced_query_parser/pending-value.h
+++ b/icing/query/advanced_query_parser/pending-value.h
@@ -36,14 +36,19 @@ enum class DataType {
kDocumentIterator,
};
+struct QueryTerm {
+ std::string term;
+ bool is_prefix_val;
+};
+
// A holder for intermediate results when processing child nodes.
struct PendingValue {
- static PendingValue CreateStringPendingValue(std::string str) {
- return PendingValue(std::move(str), DataType::kString);
+ static PendingValue CreateStringPendingValue(QueryTerm query_term) {
+ return PendingValue(std::move(query_term), DataType::kString);
}
- static PendingValue CreateTextPendingValue(std::string text) {
- return PendingValue(std::move(text), DataType::kText);
+ static PendingValue CreateTextPendingValue(QueryTerm query_term) {
+ return PendingValue(std::move(query_term), DataType::kText);
}
PendingValue() : data_type_(DataType::kNone) {}
@@ -82,22 +87,22 @@ struct PendingValue {
return std::move(string_vals_);
}
- libtextclassifier3::StatusOr<const std::string*> string_val() const& {
+ libtextclassifier3::StatusOr<const QueryTerm*> string_val() const& {
ICING_RETURN_IF_ERROR(CheckDataType(DataType::kString));
- return &string_vals_.at(0);
+ return &query_term_;
}
- libtextclassifier3::StatusOr<std::string> string_val() && {
+ libtextclassifier3::StatusOr<QueryTerm> string_val() && {
ICING_RETURN_IF_ERROR(CheckDataType(DataType::kString));
- return std::move(string_vals_.at(0));
+ return std::move(query_term_);
}
- libtextclassifier3::StatusOr<const std::string*> text_val() const& {
+ libtextclassifier3::StatusOr<const QueryTerm*> text_val() const& {
ICING_RETURN_IF_ERROR(CheckDataType(DataType::kText));
- return &string_vals_.at(0);
+ return &query_term_;
}
- libtextclassifier3::StatusOr<std::string> text_val() && {
+ libtextclassifier3::StatusOr<QueryTerm> text_val() && {
ICING_RETURN_IF_ERROR(CheckDataType(DataType::kText));
- return std::move(string_vals_.at(0));
+ return std::move(query_term_);
}
libtextclassifier3::StatusOr<int64_t> long_val() {
@@ -119,8 +124,8 @@ struct PendingValue {
DataType data_type() const { return data_type_; }
private:
- explicit PendingValue(std::string str, DataType data_type)
- : string_vals_({std::move(str)}), data_type_(data_type) {}
+ explicit PendingValue(QueryTerm query_term, DataType data_type)
+ : query_term_({std::move(query_term)}), data_type_(data_type) {}
libtextclassifier3::Status CheckDataType(DataType required_data_type) const {
if (data_type_ == required_data_type) {
@@ -136,10 +141,12 @@ struct PendingValue {
// iterator_ will be populated when data_type_ is kDocumentIterator.
std::unique_ptr<DocHitInfoIterator> iterator_;
- // string_vals_ will be populated when data_type_ is kString, kText and
- // kStringList.
+ // string_vals_ will be populated when data_type_ is kStringList.
std::vector<std::string> string_vals_;
+ // query_term_ will be populated when data_type_ is kString or kText
+ QueryTerm query_term_;
+
// long_val_ will be populated when data_type_ is kLong - after a successful
// call to ParseInt.
int64_t long_val_;
diff --git a/icing/query/advanced_query_parser/query-visitor.cc b/icing/query/advanced_query_parser/query-visitor.cc
index 71e1c7c..9df1264 100644
--- a/icing/query/advanced_query_parser/query-visitor.cc
+++ b/icing/query/advanced_query_parser/query-visitor.cc
@@ -54,8 +54,8 @@ struct CreateList {
std::vector<std::string> values;
values.reserve(args.size());
for (PendingValue& arg : args) {
- std::string val = std::move(arg).string_val().ValueOrDie();
- values.push_back(std::move(val));
+ QueryTerm val = std::move(arg).string_val().ValueOrDie();
+ values.push_back(std::move(val.term));
}
return PendingValue(std::move(values));
}
@@ -168,16 +168,17 @@ void QueryVisitor::PendingPropertyRestricts::AddValidRestricts(
}
libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
-QueryVisitor::CreateTermIterator(const std::string& term) {
+QueryVisitor::CreateTermIterator(QueryTerm query_term) {
+ TermMatchType::Code match_type = GetTermMatchType(query_term.is_prefix_val);
if (!processing_not_) {
// 1. Add term to property_query_terms_map
if (pending_property_restricts_.has_active_property_restricts()) {
for (const std::string& property_restrict :
pending_property_restricts_.active_property_restricts()) {
- property_query_terms_map_[property_restrict].insert(term);
+ property_query_terms_map_[property_restrict].insert(query_term.term);
}
} else {
- property_query_terms_map_[""].insert(term);
+ property_query_terms_map_[""].insert(query_term.term);
}
// 2. If needed add term iterator to query_term_iterators_ map.
@@ -186,22 +187,22 @@ QueryVisitor::CreateTermIterator(const std::string& term) {
// pass it into index.GetIterator
ICING_ASSIGN_OR_RETURN(
std::unique_ptr<DocHitInfoIterator> term_iterator,
- index_.GetIterator(term, /*term_start_index=*/0,
+ index_.GetIterator(query_term.term, /*term_start_index=*/0,
/*unnormalized_term_length=*/0, kSectionIdMaskAll,
- match_type_, needs_term_frequency_info_));
- query_term_iterators_[term] = std::make_unique<DocHitInfoIteratorFilter>(
- std::move(term_iterator), &document_store_, &schema_store_,
- filter_options_);
+ match_type, needs_term_frequency_info_));
+ query_term_iterators_[query_term.term] =
+ std::make_unique<DocHitInfoIteratorFilter>(
+ std::move(term_iterator), &document_store_, &schema_store_,
+ filter_options_);
}
}
// 3. Add the term iterator.
- // TODO(b/208654892): Add support for the prefix operator (*).
// TODO(b/152934343) Save "term start index" into Node and PendingValue and
// pass it into index.GetIterator
- return index_.GetIterator(term, /*term_start_index=*/0,
+ return index_.GetIterator(query_term.term, /*term_start_index=*/0,
/*unnormalized_term_length=*/0, kSectionIdMaskAll,
- match_type_, needs_term_frequency_info_);
+ match_type, needs_term_frequency_info_);
}
void QueryVisitor::RegisterFunctions() {
@@ -248,8 +249,8 @@ libtextclassifier3::StatusOr<PendingValue> QueryVisitor::SearchFunction(
// The first arg is guaranteed to be a STRING at this point. It should be safe
// to call ValueOrDie.
- const std::string* query = args.at(0).string_val().ValueOrDie();
- Lexer lexer(*query, Lexer::Language::QUERY);
+ const QueryTerm* query = args.at(0).string_val().ValueOrDie();
+ Lexer lexer(query->term, Lexer::Language::QUERY);
ICING_ASSIGN_OR_RETURN(std::vector<Lexer::LexerToken> lexer_tokens,
lexer.ExtractTokens());
@@ -307,22 +308,21 @@ libtextclassifier3::StatusOr<int64_t> QueryVisitor::PopPendingIntValue() {
return int_value;
}
-libtextclassifier3::StatusOr<std::string>
-QueryVisitor::PopPendingStringValue() {
+libtextclassifier3::StatusOr<QueryTerm> QueryVisitor::PopPendingStringValue() {
if (pending_values_.empty()) {
return absl_ports::InvalidArgumentError("Unable to retrieve string value.");
}
- ICING_ASSIGN_OR_RETURN(std::string string_value,
+ ICING_ASSIGN_OR_RETURN(QueryTerm string_value,
std::move(pending_values_.top()).string_val());
pending_values_.pop();
return string_value;
}
-libtextclassifier3::StatusOr<std::string> QueryVisitor::PopPendingTextValue() {
+libtextclassifier3::StatusOr<QueryTerm> QueryVisitor::PopPendingTextValue() {
if (pending_values_.empty()) {
return absl_ports::InvalidArgumentError("Unable to retrieve text value.");
}
- ICING_ASSIGN_OR_RETURN(std::string text_value,
+ ICING_ASSIGN_OR_RETURN(QueryTerm text_value,
std::move(pending_values_.top()).text_val());
pending_values_.pop();
return text_value;
@@ -340,19 +340,33 @@ QueryVisitor::PopPendingIterator() {
return iterator;
} else if (pending_values_.top().data_type() == DataType::kString) {
features_.insert(kVerbatimSearchFeature);
- ICING_ASSIGN_OR_RETURN(std::string value, PopPendingStringValue());
- return CreateTermIterator(std::move(value));
+ ICING_ASSIGN_OR_RETURN(QueryTerm string_value, PopPendingStringValue());
+ return CreateTermIterator(std::move(string_value));
} else {
- ICING_ASSIGN_OR_RETURN(std::string value, PopPendingTextValue());
+ ICING_ASSIGN_OR_RETURN(QueryTerm text_value, PopPendingTextValue());
ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> token_itr,
- tokenizer_.Tokenize(value));
+ tokenizer_.Tokenize(text_value.term));
std::string normalized_term;
std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
- while (token_itr->Advance()) {
- for (const Token& token : token_itr->GetTokens()) {
+ // The tokenizer will produce 1+ tokens out of the text. The prefix operator
+ // only applies to the final token.
+ bool reached_final_token = !token_itr->Advance();
+ while (!reached_final_token) {
+ std::vector<Token> tokens = token_itr->GetTokens();
+ reached_final_token = !token_itr->Advance();
+
+ // The tokenizer iterator iterates between token groups. In practice, the
+ // tokenizer used with QueryVisitor (PlainTokenizer) will always only
+ // produce a single token per token group.
+ // For simplicity, we will apply the prefix operator to *all* tokens
+ // in the final token group.
+ for (const Token& token : tokens) {
normalized_term = normalizer_.NormalizeTerm(token.text);
- ICING_ASSIGN_OR_RETURN(std::unique_ptr<DocHitInfoIterator> iterator,
- CreateTermIterator(std::move(normalized_term)));
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<DocHitInfoIterator> iterator,
+ CreateTermIterator(
+ QueryTerm{std::move(normalized_term),
+ reached_final_token && text_value.is_prefix_val}));
iterators.push_back(std::move(iterator));
}
}
@@ -382,27 +396,65 @@ QueryVisitor::PopAllPendingIterators() {
return iterators;
}
-libtextclassifier3::StatusOr<PendingValue>
-QueryVisitor::ProcessNumericComparator(const NaryOperatorNode* node) {
- // 1. The children should have been processed and added their outputs to
- // pending_values_. Time to process them.
- // The first two pending values should be the int value and the property.
+libtextclassifier3::Status QueryVisitor::ProcessNumericComparator(
+ const NaryOperatorNode* node) {
+ if (node->children().size() != 2) {
+ return absl_ports::InvalidArgumentError("Expected 2 children.");
+ }
+
+ // 1. Put in a placeholder PendingValue
+ pending_values_.push(PendingValue());
+
+ // 2. The first child is the property to restrict by.
+ node->children().at(0)->Accept(this);
+ if (has_pending_error()) {
+ return std::move(pending_error_);
+ }
+ ICING_ASSIGN_OR_RETURN(QueryTerm text_value, PopPendingTextValue());
+
+ if (text_value.is_prefix_val) {
+ return absl_ports::InvalidArgumentError(
+ "Cannot use prefix operator '*' with a property name!");
+ }
+
+ // If there is an active property restrict and this property is not present in
+ // in the active restrict set, then it's not satisfiable.
+ if (pending_property_restricts_.has_active_property_restricts() &&
+ pending_property_restricts_.active_property_restricts().find(
+ text_value.term) ==
+ pending_property_restricts_.active_property_restricts().end()) {
+ // The property restrict can't be satisfiable. Pop the placeholder that was
+ // just added and push a FALSE iterator.
+ pending_property_restricts_.PopRestricts();
+ pending_values_.pop();
+ pending_values_.push(
+ PendingValue(std::make_unique<DocHitInfoIteratorNone>()));
+ return libtextclassifier3::Status::OK;
+ }
+
+ // 3. The second child should be parseable as an integer value.
+ expecting_numeric_arg_ = true;
+ node->children().at(1)->Accept(this);
+ expecting_numeric_arg_ = false;
ICING_ASSIGN_OR_RETURN(int64_t int_value, PopPendingIntValue());
- ICING_ASSIGN_OR_RETURN(std::string property, PopPendingTextValue());
- // 2. Create the iterator.
+ // 4. Check for the placeholder.
+ if (!pending_values_.top().is_placeholder()) {
+ return absl_ports::InvalidArgumentError(
+ "Error processing arguments for node.");
+ }
+ pending_values_.pop();
+
+ // 5. Create the iterator and push it onto pending_values_.
ICING_ASSIGN_OR_RETURN(Int64Range range,
GetInt64Range(node->operator_text(), int_value));
- auto iterator_or =
- numeric_index_.GetIterator(property, range.low, range.high);
- if (!iterator_or.ok()) {
- return std::move(iterator_or).status();
- }
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<DocHitInfoIterator> iterator,
+ numeric_index_.GetIterator(text_value.term, range.low, range.high));
features_.insert(kNumericSearchFeature);
- std::unique_ptr<DocHitInfoIterator> iterator =
- std::move(iterator_or).ValueOrDie();
- return PendingValue(std::move(iterator));
+ pending_values_.push(PendingValue(std::move(iterator)));
+ return libtextclassifier3::Status::OK;
}
libtextclassifier3::StatusOr<PendingValue> QueryVisitor::ProcessAndOperator(
@@ -421,6 +473,85 @@ libtextclassifier3::StatusOr<PendingValue> QueryVisitor::ProcessOrOperator(
return PendingValue(CreateOrIterator(std::move(iterators)));
}
+libtextclassifier3::Status QueryVisitor::ProcessNegationOperator(
+ const UnaryOperatorNode* node) {
+ // 1. Put in a placeholder PendingValue
+ pending_values_.push(PendingValue());
+
+ // 2. Visit child
+ node->child()->Accept(this);
+ if (has_pending_error()) {
+ return std::move(pending_error_);
+ }
+
+ if (pending_values_.size() < 2) {
+ return absl_ports::InvalidArgumentError(
+ "Visit unary operator child didn't correctly add pending values.");
+ }
+
+ // 3. We want to preserve the original text of the integer value, append our
+ // minus and *then* parse as an int.
+ ICING_ASSIGN_OR_RETURN(QueryTerm int_text_val, PopPendingTextValue());
+ int_text_val.term = absl_ports::StrCat("-", int_text_val.term);
+ PendingValue pending_value =
+ PendingValue::CreateTextPendingValue(std::move(int_text_val));
+ ICING_RETURN_IF_ERROR(pending_value.long_val());
+
+ // We've parsed our integer value successfully. Pop our placeholder, push it
+ // on to the stack and return successfully.
+ if (!pending_values_.top().is_placeholder()) {
+ return absl_ports::InvalidArgumentError(
+ "Error processing arguments for node.");
+ }
+ pending_values_.pop();
+ pending_values_.push(std::move(pending_value));
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status QueryVisitor::ProcessNotOperator(
+ const UnaryOperatorNode* node) {
+ // TODO(b/265312785) Consider implementing query optimization when we run into
+ // nested NOTs. This would allow us to simplify a query like "NOT (-foo)" to
+ // just "foo". This would also require more complicate rewrites as we would
+ // need to do things like rewrite "NOT (-a OR b)" as "a AND -b" and
+ // "NOT (price < 5)" as "price >= 5".
+ // 1. Put in a placeholder PendingValue
+ pending_values_.push(PendingValue());
+ // Toggle whatever the current value of 'processing_not_' is before visiting
+ // the children.
+ processing_not_ = !processing_not_;
+
+ // 2. Visit child
+ node->child()->Accept(this);
+ if (has_pending_error()) {
+ return std::move(pending_error_);
+ }
+
+ if (pending_values_.size() < 2) {
+ return absl_ports::InvalidArgumentError(
+ "Visit unary operator child didn't correctly add pending values.");
+ }
+
+ // 3. Retrieve the delegate iterator
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<DocHitInfoIterator> delegate,
+ PopPendingIterator());
+
+ // 4. Check for the placeholder.
+ if (!pending_values_.top().is_placeholder()) {
+ return absl_ports::InvalidArgumentError(
+ "Error processing arguments for node.");
+ }
+ pending_values_.pop();
+
+ pending_values_.push(PendingValue(std::make_unique<DocHitInfoIteratorNot>(
+ std::move(delegate), document_store_.last_added_document_id())));
+
+ // Untoggle whatever the current value of 'processing_not_' is now that we've
+ // finished processing this NOT.
+ processing_not_ = !processing_not_;
+ return libtextclassifier3::Status::OK;
+}
+
libtextclassifier3::Status QueryVisitor::ProcessHasOperator(
const NaryOperatorNode* node) {
if (node->children().size() != 2) {
@@ -435,8 +566,12 @@ libtextclassifier3::Status QueryVisitor::ProcessHasOperator(
if (has_pending_error()) {
return pending_error_;
}
- ICING_ASSIGN_OR_RETURN(std::string property, PopPendingTextValue());
- pending_property_restricts_.AddValidRestricts({property});
+ ICING_ASSIGN_OR_RETURN(QueryTerm text_value, PopPendingTextValue());
+ if (text_value.is_prefix_val) {
+ return absl_ports::InvalidArgumentError(
+ "Cannot use prefix operator '*' with a property name!");
+ }
+ pending_property_restricts_.AddValidRestricts({text_value.term});
// Just added a restrict - if there are no active property restricts then that
// be because this restrict is unsatisfiable.
@@ -466,7 +601,7 @@ libtextclassifier3::Status QueryVisitor::ProcessHasOperator(
pending_values_.pop();
pending_property_restricts_.PopRestricts();
- std::set<std::string> property_restricts = {std::move(property)};
+ std::set<std::string> property_restricts = {std::move(text_value.term)};
pending_values_.push(
PendingValue(std::make_unique<DocHitInfoIteratorSectionRestrict>(
std::move(delegate), &document_store_, &schema_store_,
@@ -487,16 +622,16 @@ void QueryVisitor::VisitString(const StringNode* node) {
return;
}
std::string unescaped_string = std::move(unescaped_string_or).ValueOrDie();
- pending_values_.push(
- PendingValue::CreateStringPendingValue(std::move(unescaped_string)));
+ pending_values_.push(PendingValue::CreateStringPendingValue(
+ QueryTerm{std::move(unescaped_string), node->is_prefix()}));
}
void QueryVisitor::VisitText(const TextNode* node) {
// TEXT nodes could either be a term (and will become DocHitInfoIteratorTerm)
// or a property name. As such, we just push the TEXT value into pending
// values and determine which it is at a later point.
- pending_values_.push(
- PendingValue::CreateTextPendingValue(std::move(node->value())));
+ pending_values_.push(PendingValue::CreateTextPendingValue(
+ QueryTerm{std::move(node->value()), node->is_prefix()}));
}
void QueryVisitor::VisitMember(const MemberNode* node) {
@@ -517,21 +652,41 @@ void QueryVisitor::VisitMember(const MemberNode* node) {
}
}
- // 3. The children should have been processed and added their outputs to
- // pending_values_. Time to process them.
- libtextclassifier3::StatusOr<std::string> member_or;
- std::vector<std::string> members;
- while (!pending_values_.empty() && !pending_values_.top().is_placeholder()) {
- member_or = PopPendingTextValue();
- if (!member_or.ok()) {
- pending_error_ = std::move(member_or).status();
- return;
+ // 3. Now process the results of the children and produce a single pending
+ // value representing this member.
+ PendingValue pending_value;
+ if (node->children().size() == 1) {
+ // 3a. This member only has a single child, then the pending value produced
+ // by that child is the final value produced by this member.
+ pending_value = std::move(pending_values_.top());
+ pending_values_.pop();
+ } else {
+ // 3b. Retrieve the values of all children and concatenate them into a
+ // single value.
+ libtextclassifier3::StatusOr<QueryTerm> member_or;
+ std::vector<std::string> members;
+ QueryTerm text_val;
+ while (!pending_values_.empty() &&
+ !pending_values_.top().is_placeholder()) {
+ member_or = PopPendingTextValue();
+ if (!member_or.ok()) {
+ pending_error_ = std::move(member_or).status();
+ return;
+ }
+ text_val = std::move(member_or).ValueOrDie();
+ if (text_val.is_prefix_val) {
+ pending_error_ = absl_ports::InvalidArgumentError(
+ "Cannot use prefix operator '*' within a property name!");
+ return;
+ }
+ members.push_back(std::move(text_val.term));
}
- members.push_back(std::move(member_or).ValueOrDie());
+ QueryTerm member;
+ member.term = absl_ports::StrJoin(members.rbegin(), members.rend(),
+ property_util::kPropertyPathSeparator);
+ member.is_prefix_val = false;
+ pending_value = PendingValue::CreateTextPendingValue(std::move(member));
}
- std::string member =
- absl_ports::StrJoin(members.rbegin(), members.rend(),
- property_util::kPropertyPathSeparator);
// 4. If pending_values_ is empty somehow, then our placeholder disappeared
// somehow.
@@ -542,7 +697,7 @@ void QueryVisitor::VisitMember(const MemberNode* node) {
}
pending_values_.pop();
- pending_values_.push(PendingValue::CreateTextPendingValue(std::move(member)));
+ pending_values_.push(std::move(pending_value));
}
void QueryVisitor::VisitFunction(const FunctionNode* node) {
@@ -594,59 +749,26 @@ void QueryVisitor::VisitFunction(const FunctionNode* node) {
// `NOT search("foo", createList("prop1"))
// AND search("bar", createList("prop1"))`
void QueryVisitor::VisitUnaryOperator(const UnaryOperatorNode* node) {
- if (node->operator_text() != "NOT") {
+ bool is_minus = node->operator_text() == "MINUS";
+ if (node->operator_text() != "NOT" && !is_minus) {
pending_error_ = absl_ports::UnimplementedError(
absl_ports::StrCat("Visiting for unary operator ",
node->operator_text(), " not implemented yet."));
return;
}
- // TODO(b/265312785) Consider implementing query optimization when we run into
- // nested NOTs. This would allow us to simplify a query like "NOT (-foo)" to
- // just "foo". This would also require more complicate rewrites as we would
- // need to do things like rewrite "NOT (-a OR b)" as "a AND -b" and
- // "NOT (price < 5)" as "price >= 5".
- // 1. Put in a placeholder PendingValue
- pending_values_.push(PendingValue());
- // Toggle whatever the current value of 'processing_not_' is before visiting
- // the children.
- processing_not_ = !processing_not_;
-
- // 2. Visit child
- node->child()->Accept(this);
- if (has_pending_error()) {
- return;
- }
-
- if (pending_values_.size() < 2) {
- pending_error_ = absl_ports::InvalidArgumentError(
- "Visit unary operator child didn't correctly add pending values.");
- return;
- }
-
- // 3. Retrieve the delegate iterator
- auto iterator_or = PopPendingIterator();
- if (!iterator_or.ok()) {
- pending_error_ = std::move(iterator_or).status();
- return;
+ libtextclassifier3::Status status;
+ if (expecting_numeric_arg_ && is_minus) {
+ // If the operator is a MINUS ('-') and we're at the child of a numeric
+ // comparator, then this must be a negation ('-3')
+ status = ProcessNegationOperator(node);
+ } else {
+ status = ProcessNotOperator(node);
}
- std::unique_ptr<DocHitInfoIterator> delegate =
- std::move(iterator_or).ValueOrDie();
- // 4. Check for the placeholder.
- if (!pending_values_.top().is_placeholder()) {
- pending_error_ = absl_ports::InvalidArgumentError(
- "Error processing arguments for node.");
- return;
+ if (!status.ok()) {
+ pending_error_ = std::move(status);
}
- pending_values_.pop();
-
- pending_values_.push(PendingValue(std::make_unique<DocHitInfoIteratorNot>(
- std::move(delegate), document_store_.last_added_document_id())));
-
- // Untoggle whatever the current value of 'processing_not_' is now that we've
- // finished processing this NOT.
- processing_not_ = !processing_not_;
}
void QueryVisitor::VisitNaryOperator(const NaryOperatorNode* node) {
@@ -662,6 +784,12 @@ void QueryVisitor::VisitNaryOperator(const NaryOperatorNode* node) {
pending_error_ = std::move(status);
}
return;
+ } else if (IsNumericComparator(node->operator_text())) {
+ libtextclassifier3::Status status = ProcessNumericComparator(node);
+ if (!status.ok()) {
+ pending_error_ = std::move(status);
+ }
+ return;
}
// 1. Put in a placeholder PendingValue
@@ -677,9 +805,7 @@ void QueryVisitor::VisitNaryOperator(const NaryOperatorNode* node) {
// 3. Retrieve the pending value for this node.
libtextclassifier3::StatusOr<PendingValue> pending_value_or;
- if (IsNumericComparator(node->operator_text())) {
- pending_value_or = ProcessNumericComparator(node);
- } else if (node->operator_text() == "AND") {
+ if (node->operator_text() == "AND") {
pending_value_or = ProcessAndOperator(node);
} else if (node->operator_text() == "OR") {
pending_value_or = ProcessOrOperator(node);
diff --git a/icing/query/advanced_query_parser/query-visitor.h b/icing/query/advanced_query_parser/query-visitor.h
index b4e3dd7..7498457 100644
--- a/icing/query/advanced_query_parser/query-visitor.h
+++ b/icing/query/advanced_query_parser/query-visitor.h
@@ -119,19 +119,21 @@ class QueryVisitor : public AbstractSyntaxTreeVisitor {
match_type_(match_type),
needs_term_frequency_info_(needs_term_frequency_info),
pending_property_restricts_(std::move(pending_property_restricts)),
- processing_not_(processing_not) {
+ processing_not_(processing_not),
+ expecting_numeric_arg_(false) {
RegisterFunctions();
}
bool has_pending_error() const { return !pending_error_.ok(); }
- // Creates a DocHitInfoIterator reflecting the provided term. Also populates,
+ // Creates a DocHitInfoIterator reflecting the provided term and whether the
+ // prefix operator has been applied to this term. Also populates,
// property_query_terms_map_ and query_term_iterators_ as appropriate.
// Returns:
// - On success, a DocHitInfoIterator for the provided term
// - INVALID_ARGUMENT if unable to create an iterator for the term.
libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
- CreateTermIterator(const std::string& term);
+ CreateTermIterator(QueryTerm term);
// Processes the PendingValue at the top of pending_values_, parses it into a
// int64_t and pops the top.
@@ -143,15 +145,17 @@ class QueryVisitor : public AbstractSyntaxTreeVisitor {
// Processes the PendingValue at the top of pending_values_ and pops the top.
// Returns:
- // - On success, the string value stored in the text at the top
+ // - On success, the string value stored in the text at the top and a bool
+ // indicating whether or not the string value has a prefix operator.
// - INVALID_ARGUMENT if pending_values_ is empty or doesn't hold a string.
- libtextclassifier3::StatusOr<std::string> PopPendingStringValue();
+ libtextclassifier3::StatusOr<QueryTerm> PopPendingStringValue();
// Processes the PendingValue at the top of pending_values_ and pops the top.
// Returns:
// - On success, the string value stored in the text at the top
+ // indicating whether or not the string value has a prefix operator.
// - INVALID_ARGUMENT if pending_values_ is empty or doesn't hold a text.
- libtextclassifier3::StatusOr<std::string> PopPendingTextValue();
+ libtextclassifier3::StatusOr<QueryTerm> PopPendingTextValue();
// Processes the PendingValue at the top of pending_values_ and pops the top.
// Returns:
@@ -171,15 +175,35 @@ class QueryVisitor : public AbstractSyntaxTreeVisitor {
libtextclassifier3::StatusOr<std::vector<std::unique_ptr<DocHitInfoIterator>>>
PopAllPendingIterators();
+ // Processes the unary operator node as a NOT operator. A NOT can have an
+ // operator type of "NOT" or "MINUS"
+ //
+ // RETURNS:
+ // - OK on success
+ // - INVALID_ARGUMENT if any errors are encountered while processing
+ // node->child
+ libtextclassifier3::Status ProcessNotOperator(const UnaryOperatorNode* node);
+
+ // Processes the unary operator node as a negation operator. A negation
+ // operator should have an operator of type "MINUS" and it's children must
+ // resolve to a numeric value.
+ //
+ // RETURNS:
+ // - OK on success
+ // - INVALID_ARGUMENT if the node->child can't be resolved to a numeric
+ // value.
+ libtextclassifier3::Status ProcessNegationOperator(
+ const UnaryOperatorNode* node);
+
// Processes the NumericComparator represented by node. This must be called
// *after* this node's children have been visited. The PendingValues added by
// this node's children will be consumed by this function and the PendingValue
// for this node will be returned.
// Returns:
- // - On success, then PendingValue representing this node and it's children.
+ // - On success, OK
// - INVALID_ARGUMENT if unable to retrieve string value or int value
// - NOT_FOUND if there is no entry in the numeric index for the property
- libtextclassifier3::StatusOr<PendingValue> ProcessNumericComparator(
+ libtextclassifier3::Status ProcessNumericComparator(
const NaryOperatorNode* node);
// Processes the AND and OR operators represented by the node. This must be
@@ -230,6 +254,12 @@ class QueryVisitor : public AbstractSyntaxTreeVisitor {
// children cannot be resolved to a MEMBER or an iterator respectively.
libtextclassifier3::Status ProcessHasOperator(const NaryOperatorNode* node);
+ // Returns the correct match type to apply based on both the match type and
+ // whether the prefix operator is currently present.
+ TermMatchType::Code GetTermMatchType(bool is_prefix) const {
+ return (is_prefix) ? TermMatchType::PREFIX : match_type_;
+ }
+
std::stack<PendingValue> pending_values_;
libtextclassifier3::Status pending_error_;
@@ -259,6 +289,10 @@ class QueryVisitor : public AbstractSyntaxTreeVisitor {
// The stack of property restricts currently being processed by the visitor.
PendingPropertyRestricts pending_property_restricts_;
bool processing_not_;
+
+ // Whether we are in the midst of processing a subtree that is expected to
+ // resolve to a numeric argument.
+ bool expecting_numeric_arg_;
};
} // namespace lib
diff --git a/icing/query/advanced_query_parser/query-visitor_test.cc b/icing/query/advanced_query_parser/query-visitor_test.cc
index 12a6631..033e86b 100644
--- a/icing/query/advanced_query_parser/query-visitor_test.cc
+++ b/icing/query/advanced_query_parser/query-visitor_test.cc
@@ -389,8 +389,7 @@ TEST_P(QueryVisitorTest, SimpleGreaterThan) {
ElementsAre(kDocumentId2));
}
-// TODO(b/208654892) Properly handle negative numbers in query expressions.
-TEST_P(QueryVisitorTest, DISABLED_IntMinLessThanEqual) {
+TEST_P(QueryVisitorTest, IntMinLessThanEqual) {
// Setup the numeric index with docs 0, 1 and 2 holding the values INT_MIN,
// INT_MAX and INT_MIN + 1 respectively.
int64_t int_min = std::numeric_limits<int64_t>::min();
@@ -648,8 +647,7 @@ TEST_P(QueryVisitorTest, NeverVisitedReturnsInvalid) {
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
-// TODO(b/208654892) Properly handle negative numbers in query expressions.
-TEST_P(QueryVisitorTest, DISABLED_IntMinLessThanInvalid) {
+TEST_P(QueryVisitorTest, IntMinLessThanInvalid) {
// Setup the numeric index with docs 0, 1 and 2 holding the values INT_MIN,
// INT_MAX and INT_MIN + 1 respectively.
int64_t int_min = std::numeric_limits<int64_t>::min();
@@ -724,6 +722,74 @@ TEST_P(QueryVisitorTest, NumericComparisonPropertyStringIsInvalid) {
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
+TEST_P(QueryVisitorTest, NumericComparatorDoesntAffectLaterTerms) {
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("type"))
+ .Build()));
+
+ // Index three documents:
+ // - Doc0: ["-2", "-1", "1", "2"] and [-2, -1, 1, 2]
+ // - Doc1: [-1]
+ // - Doc2: ["2"] and [-1]
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build()));
+ std::unique_ptr<NumericIndex<int64_t>::Editor> editor =
+ numeric_index_->Edit("price", kDocumentId0, kSectionId0);
+ editor->BufferKey(-2);
+ editor->BufferKey(-1);
+ editor->BufferKey(1);
+ editor->BufferKey(2);
+ std::move(*editor).IndexAllBufferedKeys();
+ Index::Editor term_editor = index_->Edit(
+ kDocumentId0, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ term_editor.BufferTerm("-2");
+ term_editor.BufferTerm("-1");
+ term_editor.BufferTerm("1");
+ term_editor.BufferTerm("2");
+ term_editor.IndexAllBufferedTerms();
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build()));
+ editor = numeric_index_->Edit("price", kDocumentId1, kSectionId0);
+ editor->BufferKey(-1);
+ std::move(*editor).IndexAllBufferedKeys();
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build()));
+ editor = numeric_index_->Edit("price", kDocumentId2, kSectionId0);
+ editor->BufferKey(-1);
+ std::move(*editor).IndexAllBufferedKeys();
+ term_editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ term_editor.BufferTerm("2");
+ term_editor.IndexAllBufferedTerms();
+
+ // Translating MINUS chars that are interpreted as NOTs, this query would be
+ // `price == -1 AND NOT 2`
+ // All documents should match `price == -1`
+ // Both docs 0 and 2 should be excluded because of the `NOT 2` clause
+ // doc0 has both a text and number entry for `-2`, neither of which should
+ // match.
+ std::string query = CreateQuery("price == -1 -2");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(),
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true);
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ EXPECT_THAT(query_results.features_in_use,
+ ElementsAre(kNumericSearchFeature));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), IsEmpty());
+ EXPECT_THAT(query_results.query_terms, IsEmpty());
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId1));
+}
+
TEST_P(QueryVisitorTest, SingleTermTermFrequencyEnabled) {
// Setup the index with docs 0, 1 and 2 holding the values "foo", "foo" and
// "bar" respectively.
@@ -827,6 +893,169 @@ TEST_P(QueryVisitorTest, SingleTermTermFrequencyDisabled) {
StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
}
+TEST_P(QueryVisitorTest, SingleTermPrefix) {
+ // Setup the index with docs 0, 1 and 2 holding the values "foo", "foo" and
+ // "bar" respectively.
+ Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ editor.BufferTerm("foo");
+ editor.IndexAllBufferedTerms();
+
+ editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ editor.BufferTerm("foo");
+ editor.IndexAllBufferedTerms();
+
+ editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ editor.BufferTerm("bar");
+ editor.IndexAllBufferedTerms();
+
+ // An EXACT query for 'fo' won't match anything.
+ std::string query = CreateQuery("fo");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(),
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_EXACT,
+ /*needs_term_frequency_info_=*/true);
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre(""));
+ EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre("fo"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("fo"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()), IsEmpty());
+
+ query = CreateQuery("fo*");
+ ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query));
+ QueryVisitor query_visitor_two(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(),
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_EXACT,
+ /*needs_term_frequency_info_=*/true);
+ root_node->Accept(&query_visitor_two);
+ ICING_ASSERT_OK_AND_ASSIGN(query_results,
+ std::move(query_visitor_two).ConsumeResults());
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre(""));
+ EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre("fo"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("fo"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId1, kDocumentId0));
+}
+
+TEST_P(QueryVisitorTest, PrefixOperatorAfterPropertyReturnsInvalid) {
+ std::string query = "price* < 2";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(),
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true);
+ root_node->Accept(&query_visitor);
+ EXPECT_THAT(std::move(query_visitor).ConsumeResults(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_P(QueryVisitorTest, PrefixOperatorAfterNumericValueReturnsInvalid) {
+ std::string query = "price < 2*";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(),
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true);
+ root_node->Accept(&query_visitor);
+ EXPECT_THAT(std::move(query_visitor).ConsumeResults(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_P(QueryVisitorTest, PrefixOperatorAfterPropertyRestrictReturnsInvalid) {
+ std::string query = "subject*:foo";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(),
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true);
+ root_node->Accept(&query_visitor);
+ EXPECT_THAT(std::move(query_visitor).ConsumeResults(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_P(QueryVisitorTest, SegmentationWithPrefix) {
+ // Setup the index with docs 0, 1 and 2 holding the values ["foo", "ba"],
+ // ["foo", "ba"] and ["bar", "fo"] respectively.
+ Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ editor.BufferTerm("foo");
+ editor.BufferTerm("ba");
+ editor.IndexAllBufferedTerms();
+
+ editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ editor.BufferTerm("foo");
+ editor.BufferTerm("ba");
+ editor.IndexAllBufferedTerms();
+
+ editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ editor.BufferTerm("bar");
+ editor.BufferTerm("fo");
+ editor.IndexAllBufferedTerms();
+
+ // An EXACT query for `ba?fo` will be lexed into a single TEXT token.
+ // The visitor will tokenize it into `ba` and `fo` (`?` is dropped because it
+ // is punctuation). Each document will match one and only one of these exact
+ // tokens. Therefore, nothing will match this query.
+ std::string query = CreateQuery("ba?fo");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(),
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_EXACT,
+ /*needs_term_frequency_info_=*/true);
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre(""));
+ EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre("ba", "fo"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("ba", "fo"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()), IsEmpty());
+
+ // An EXACT query for `ba?fo*` will be lexed into a TEXT token and a TIMES
+ // token.
+ // The visitor will tokenize the TEXT into `ba` and `fo` (`?` is dropped
+ // because it is punctuation). The prefix operator should only apply to the
+ // final token `fo`. This will cause matches with docs 0 and 1 which contain
+ // "ba" and "foo". doc2 will not match because "ba" does not exactly match
+ // either "bar" or "fo".
+ query = CreateQuery("ba?fo*");
+ ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query));
+ QueryVisitor query_visitor_two(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(),
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_EXACT,
+ /*needs_term_frequency_info_=*/true);
+ root_node->Accept(&query_visitor_two);
+ ICING_ASSERT_OK_AND_ASSIGN(query_results,
+ std::move(query_visitor_two).ConsumeResults());
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre(""));
+ EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre("ba", "fo"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("ba", "fo"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId1, kDocumentId0));
+}
+
TEST_P(QueryVisitorTest, SingleVerbatimTerm) {
// Setup the index with docs 0, 1 and 2 holding the values "foo:bar(baz)",
// "foo:bar(baz)" and "bar:baz(foo)" respectively.
@@ -867,6 +1096,46 @@ TEST_P(QueryVisitorTest, SingleVerbatimTerm) {
ElementsAre(kDocumentId1, kDocumentId0));
}
+TEST_P(QueryVisitorTest, SingleVerbatimTermPrefix) {
+ // Setup the index with docs 0, 1 and 2 holding the values "foo:bar(baz)",
+ // "foo:bar(abc)" and "bar:baz(foo)" respectively.
+ Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ editor.BufferTerm("foo:bar(baz)");
+ editor.IndexAllBufferedTerms();
+
+ editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ editor.BufferTerm("foo:bar(abc)");
+ editor.IndexAllBufferedTerms();
+
+ editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ editor.BufferTerm("bar:baz(foo)");
+ editor.IndexAllBufferedTerms();
+
+ // Query for `"foo:bar("*`. This should match docs 0 and 1.
+ std::string query = CreateQuery("\"foo:bar(\"*");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(),
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_EXACT,
+ /*needs_term_frequency_info_=*/true);
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ EXPECT_THAT(query_results.features_in_use,
+ ElementsAre(kVerbatimSearchFeature));
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre(""));
+ EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre("foo:bar("));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("foo:bar("));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId1, kDocumentId0));
+}
+
// There are three primary cases to worry about for escaping:
//
// NOTE: The following comments use ` chars to denote the beginning and end of
diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc
index 2a7e108..35ee172 100644
--- a/icing/store/document-store.cc
+++ b/icing/store/document-store.cc
@@ -58,6 +58,7 @@
#include "icing/util/clock.h"
#include "icing/util/crc32.h"
#include "icing/util/data-loss.h"
+#include "icing/util/encode-util.h"
#include "icing/util/fingerprint-util.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
@@ -88,6 +89,17 @@ constexpr int32_t kUriMapperMaxSize = 36 * 1024 * 1024; // 36 MiB
constexpr int32_t kNamespaceMapperMaxSize = 3 * 128 * 1024; // 384 KiB
constexpr int32_t kCorpusMapperMaxSize = 3 * 128 * 1024; // 384 KiB
+// Whether to use namespace id or namespace name to build up fingerprint for
+// document_key_mapper_ and corpus_mapper_.
+// Note: Changing this flag will require a reconstruction of the internal
+// mappers in the document store. A easy way to trigger a rebuild is to change
+// the kMagic value.
+//
+// TODO(b/259969017) Flip this flag to true at the time when we switch to use
+// persistent hash map for document_key_mapper_ so that we just need one
+// reconstruction of the internal mappers.
+constexpr bool kNamespaceIdFingerprint = false;
+
DocumentWrapper CreateDocumentWrapper(DocumentProto&& document) {
DocumentWrapper document_wrapper;
*document_wrapper.mutable_document() = std::move(document);
@@ -126,18 +138,40 @@ std::string MakeCorpusMapperFilename(const std::string& base_dir) {
return absl_ports::StrCat(base_dir, "/", kCorpusIdMapperFilename);
}
-// TODO(adorokhine): This class internally uses an 8-byte fingerprint of the
-// Key and stores the key/value in a file-backed-trie that adds an ~80 byte
-// overhead per key. As we know that these fingerprints are always 8-bytes in
-// length and that they're random, we might be able to store them more
-// compactly.
-std::string MakeFingerprint(std::string_view field1, std::string_view field2) {
- // Using a 64-bit fingerprint to represent the key could lead to collisions.
- // But, even with 200K unique keys, the probability of collision is about
- // one-in-a-billion (https://en.wikipedia.org/wiki/Birthday_attack).
- uint64_t fprint =
- tc3farmhash::Fingerprint64(absl_ports::StrCat(field1, field2));
- return fingerprint_util::GetFingerprintString(fprint);
+// This function will encode a namespace id into a fixed 3 bytes string.
+std::string EncodeNamespaceId(NamespaceId namespace_id) {
+ // encoding should be 1 to 3 bytes based on the value of namespace_id.
+ std::string encoding = encode_util::EncodeIntToCString(namespace_id);
+ // Make encoding to fixed 3 bytes.
+ while (encoding.size() < 3) {
+ // DynamicTrie cannot handle keys with 0 as bytes, so we append it using 1,
+ // just like what we do in encode_util::EncodeIntToCString.
+ //
+ // The reason that this works is because DecodeIntToString decodes a byte
+ // value of 0x01 as 0x00. When EncodeIntToCString returns a namespaceid
+ // encoding that is less than 3 bytes, it means that the id contains
+ // unencoded leading 0x00. So here we're explicitly encoding those bytes as
+ // 0x01.
+ encoding.push_back(1);
+ }
+ return encoding;
+}
+
+std::string MakeFingerprint(NamespaceId namespace_id,
+ std::string_view namespace_,
+ std::string_view uri_or_schema) {
+ if (!kNamespaceIdFingerprint) {
+ // Using a 64-bit fingerprint to represent the key could lead to collisions.
+ // But, even with 200K unique keys, the probability of collision is about
+ // one-in-a-billion (https://en.wikipedia.org/wiki/Birthday_attack).
+ uint64_t fprint = tc3farmhash::Fingerprint64(
+ absl_ports::StrCat(namespace_, uri_or_schema));
+ return fingerprint_util::GetFingerprintString(fprint);
+ } else {
+ return absl_ports::StrCat(EncodeNamespaceId(namespace_id),
+ encode_util::EncodeIntToCString(
+ tc3farmhash::Fingerprint64(uri_or_schema)));
+ }
}
int64_t CalculateExpirationTimestampMs(int64_t creation_timestamp_ms,
@@ -489,10 +523,16 @@ libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles(
continue;
}
}
+
+ ICING_ASSIGN_OR_RETURN(
+ NamespaceId namespace_id,
+ namespace_mapper_->GetOrPut(document_wrapper.document().namespace_(),
+ namespace_mapper_->num_keys()));
+
// Updates key mapper and document_id mapper with the new document
DocumentId new_document_id = document_id_mapper_->num_elements();
ICING_RETURN_IF_ERROR(document_key_mapper_->Put(
- MakeFingerprint(document_wrapper.document().namespace_(),
+ MakeFingerprint(namespace_id, document_wrapper.document().namespace_(),
document_wrapper.document().uri()),
new_document_id));
ICING_RETURN_IF_ERROR(
@@ -517,14 +557,9 @@ libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles(
schema_type_id = schema_type_id_or.ValueOrDie();
}
- ICING_ASSIGN_OR_RETURN(
- NamespaceId namespace_id,
- namespace_mapper_->GetOrPut(document_wrapper.document().namespace_(),
- namespace_mapper_->num_keys()));
-
// Update corpus maps
std::string corpus =
- MakeFingerprint(document_wrapper.document().namespace_(),
+ MakeFingerprint(namespace_id, document_wrapper.document().namespace_(),
document_wrapper.document().schema());
ICING_ASSIGN_OR_RETURN(
CorpusId corpusId,
@@ -892,20 +927,21 @@ libtextclassifier3::StatusOr<DocumentId> DocumentStore::InternalPut(
"some space.");
}
- ICING_RETURN_IF_ERROR(document_key_mapper_->Put(
- MakeFingerprint(name_space, uri), new_document_id));
- ICING_RETURN_IF_ERROR(document_id_mapper_->Set(new_document_id, file_offset));
-
// Update namespace maps
ICING_ASSIGN_OR_RETURN(
NamespaceId namespace_id,
namespace_mapper_->GetOrPut(name_space, namespace_mapper_->num_keys()));
+ // Updates key mapper and document_id mapper
+ ICING_RETURN_IF_ERROR(document_key_mapper_->Put(
+ MakeFingerprint(namespace_id, name_space, uri), new_document_id));
+ ICING_RETURN_IF_ERROR(document_id_mapper_->Set(new_document_id, file_offset));
+
// Update corpus maps
- ICING_ASSIGN_OR_RETURN(
- CorpusId corpusId,
- corpus_mapper_->GetOrPut(MakeFingerprint(name_space, schema),
- corpus_mapper_->num_keys()));
+ ICING_ASSIGN_OR_RETURN(CorpusId corpusId,
+ corpus_mapper_->GetOrPut(
+ MakeFingerprint(namespace_id, name_space, schema),
+ corpus_mapper_->num_keys()));
ICING_ASSIGN_OR_RETURN(CorpusAssociatedScoreData scoring_data,
GetCorpusAssociatedScoreDataToUpdate(corpusId));
@@ -1023,17 +1059,21 @@ libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get(
libtextclassifier3::StatusOr<DocumentId> DocumentStore::GetDocumentId(
const std::string_view name_space, const std::string_view uri) const {
- auto document_id_or =
- document_key_mapper_->Get(MakeFingerprint(name_space, uri));
- if (!document_id_or.ok()) {
- return absl_ports::Annotate(
- document_id_or.status(),
- absl_ports::StrCat("Failed to find DocumentId by key: ", name_space,
- ", ", uri));
+ auto namespace_id_or = namespace_mapper_->Get(name_space);
+ libtextclassifier3::Status status = namespace_id_or.status();
+ if (status.ok()) {
+ NamespaceId namespace_id = namespace_id_or.ValueOrDie();
+ auto document_id_or = document_key_mapper_->Get(
+ MakeFingerprint(namespace_id, name_space, uri));
+ status = document_id_or.status();
+ if (status.ok()) {
+ // Guaranteed to have a DocumentId now
+ return document_id_or.ValueOrDie();
+ }
}
-
- // Guaranteed to have a DocumentId now
- return document_id_or.ValueOrDie();
+ return absl_ports::Annotate(
+ status, absl_ports::StrCat(
+ "Failed to find DocumentId by key: ", name_space, ", ", uri));
}
std::vector<std::string> DocumentStore::GetAllNamespaces() const {
@@ -1158,7 +1198,9 @@ libtextclassifier3::StatusOr<NamespaceId> DocumentStore::GetNamespaceId(
libtextclassifier3::StatusOr<CorpusId> DocumentStore::GetCorpusId(
const std::string_view name_space, const std::string_view schema) const {
- return corpus_mapper_->Get(MakeFingerprint(name_space, schema));
+ ICING_ASSIGN_OR_RETURN(NamespaceId namespace_id,
+ namespace_mapper_->Get(name_space));
+ return corpus_mapper_->Get(MakeFingerprint(namespace_id, name_space, schema));
}
libtextclassifier3::StatusOr<int32_t> DocumentStore::GetResultGroupingEntryId(
diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt
index 2dc1352..5838a7b 100644
--- a/synced_AOSP_CL_number.txt
+++ b/synced_AOSP_CL_number.txt
@@ -1 +1 @@
-set(synced_AOSP_CL_number=513153289)
+set(synced_AOSP_CL_number=513864120)