path: root/icing/index/main/main-index-merger.cc

// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "icing/index/main/main-index-merger.h"

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <unordered_map>
#include <utility>
#include <vector>

#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/file/posting_list/posting-list-common.h"
#include "icing/index/hit/hit.h"
#include "icing/index/lite/lite-index.h"
#include "icing/index/lite/term-id-hit-pair.h"
#include "icing/index/main/main-index.h"
#include "icing/index/term-id-codec.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"

namespace icing {
namespace lib {

namespace {

class HitSelector {
 public:
  // Returns whether or not term_id_hit_pair has the same term_id, document_id
  // and section_id as the hit most recently passed to SelectIfBetter().
  bool IsEquivalentHit(const TermIdHitPair& term_id_hit_pair) {
    return prev_.term_id() == term_id_hit_pair.term_id() &&
           prev_.hit().document_id() == term_id_hit_pair.hit().document_id() &&
           prev_.hit().section_id() == term_id_hit_pair.hit().section_id();
  }

  // Merges term_id_hit_pair with previously added hits.
  void SelectIfBetter(const TermIdHitPair& term_id_hit_pair) {
    if (term_id_hit_pair.hit().is_prefix_hit()) {
      SelectPrefixHitIfBetter(term_id_hit_pair);
    } else {
      SelectExactHitIfBetter(term_id_hit_pair);
    }
    prev_ = term_id_hit_pair;
  }

  // Adds all valid, selected hits to hits starting at position pos in hits.
  // Returns the offset in hits after the position of the last added hit.
  // This function may add zero, one, or two hits depending on whether the
  // HitSelector holds a valid exact hit, a valid prefix hit, both, or neither.
  size_t InsertSelectedHits(size_t pos, std::vector<TermIdHitPair>* hits) {
    // Given the prefix/exact hits for a given term+docid+sectionid, push needed
    // hits into hits array at offset pos. Return new pos.
    if (best_prefix_hit_.hit().is_valid() && best_exact_hit_.hit().is_valid()) {
      (*hits)[pos++] = best_exact_hit_;
      const Hit& prefix_hit = best_prefix_hit_.hit();
      // The prefix hit's term frequency is the sum of the prefix and exact
      // term frequencies, capped at kMaxTermFrequency.
      Hit::TermFrequency final_term_frequency = std::min(
          static_cast<int>(Hit::kMaxTermFrequency),
          prefix_hit.term_frequency() + best_exact_hit_.hit().term_frequency());
      best_prefix_hit_ = TermIdHitPair(
          best_prefix_hit_.term_id(),
          Hit(prefix_hit.section_id(), prefix_hit.document_id(),
              final_term_frequency, prefix_hit.is_in_prefix_section(),
              prefix_hit.is_prefix_hit()));
      (*hits)[pos++] = best_prefix_hit_;
      // Ensure sorted.
      if (best_prefix_hit_.hit() < best_exact_hit_.hit()) {
        std::swap((*hits)[pos - 1], (*hits)[pos - 2]);
      }
    } else if (best_prefix_hit_.hit().is_valid()) {
      (*hits)[pos++] = best_prefix_hit_;
    } else if (best_exact_hit_.hit().is_valid()) {
      (*hits)[pos++] = best_exact_hit_;
    }

    return pos;
  }

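  // Clears the selected prefix/exact hits and the previous hit so the
  // selector can start accumulating the next group of equivalent hits.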
  void Reset() {
    best_prefix_hit_ = TermIdHitPair();
    best_exact_hit_ = TermIdHitPair();
    prev_ = TermIdHitPair();
  }

 private:
  void SelectPrefixHitIfBetter(const TermIdHitPair& term_id_hit_pair) {
    if (!best_prefix_hit_.hit().is_valid()) {
      best_prefix_hit_ = term_id_hit_pair;
    } else {
      const Hit& hit = term_id_hit_pair.hit();
      // Create a new prefix hit with term_frequency as the sum of the term
      // frequencies. The term frequency is capped at kMaxTermFrequency.
      Hit::TermFrequency final_term_frequency = std::min(
          static_cast<int>(Hit::kMaxTermFrequency),
          hit.term_frequency() + best_prefix_hit_.hit().term_frequency());
      best_prefix_hit_ = TermIdHitPair(
          term_id_hit_pair.term_id(),
          Hit(hit.section_id(), hit.document_id(), final_term_frequency,
              best_prefix_hit_.hit().is_in_prefix_section(),
              best_prefix_hit_.hit().is_prefix_hit()));
    }
  }

  void SelectExactHitIfBetter(const TermIdHitPair& term_id_hit_pair) {
    if (!best_exact_hit_.hit().is_valid()) {
      best_exact_hit_ = term_id_hit_pair;
    } else {
      const Hit& hit = term_id_hit_pair.hit();
      // Create a new exact hit with term_frequency as the sum of the term
      // frequencies. The term frequency is capped at kMaxTermFrequency.
      Hit::TermFrequency final_term_frequency = std::min(
          static_cast<int>(Hit::kMaxTermFrequency),
          hit.term_frequency() + best_exact_hit_.hit().term_frequency());
      best_exact_hit_ = TermIdHitPair(
          term_id_hit_pair.term_id(),
          Hit(hit.section_id(), hit.document_id(), final_term_frequency,
              best_exact_hit_.hit().is_in_prefix_section(),
              best_exact_hit_.hit().is_prefix_hit()));
    }
  }

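  // The best prefix hit and best exact hit seen so far for the current
  // (term_id, document_id, section_id) group, and the hit most recently
  // passed to SelectIfBetter().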
  TermIdHitPair best_prefix_hit_;
  TermIdHitPair best_exact_hit_;
  TermIdHitPair prev_;
};

class HitComparator {
 public:
  explicit HitComparator(
      const TermIdCodec& term_id_codec,
      const std::unordered_map<uint32_t, int>& main_tvi_to_block_index)
      : term_id_codec_(&term_id_codec),
        main_tvi_to_block_index_(&main_tvi_to_block_index) {}

  bool operator()(const TermIdHitPair& lhs, const TermIdHitPair& rhs) const {
    // Primary sort by index block. This achieves two things:
    // 1. It reduces the number of flash writes by grouping together new hits
    // for terms whose posting lists might share the same index block.
    // 2. More importantly, it ensures that newly added backfill branch points
    // are populated first (because all newly added terms have an invalid
    // block index of 0) before any new hits are added to the posting lists
    // that they backfill from.
    int lhs_index_block = GetIndexBlock(lhs.term_id());
    int rhs_index_block = GetIndexBlock(rhs.term_id());
    if (lhs_index_block == rhs_index_block) {
      // Secondary sort by term_id and hit.
      return lhs.value() < rhs.value();
    }
    return lhs_index_block < rhs_index_block;
  }

 private:
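  // Returns the index of the block that backs term_id's posting list in the
  // main index, or kInvalidBlockIndex if the term cannot be decoded or has no
  // posting list yet (i.e. it was newly added to the lexicon).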
  int GetIndexBlock(uint32_t term_id) const {
    auto term_info_or = term_id_codec_->DecodeTermInfo(term_id);
    if (!term_info_or.ok()) {
      ICING_LOG(WARNING)
          << "Unable to decode term-info during merge. This shouldn't happen.";
      return kInvalidBlockIndex;
    }
    TermIdCodec::DecodedTermInfo term_info =
        std::move(term_info_or).ValueOrDie();
    auto itr = main_tvi_to_block_index_->find(term_info.tvi);
    if (itr == main_tvi_to_block_index_->end()) {
      return kInvalidBlockIndex;
    }
    return itr->second;
  }

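  // Not owned. Both must outlive the HitComparator.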
  const TermIdCodec* term_id_codec_;
  const std::unordered_map<uint32_t, int>* main_tvi_to_block_index_;
};

// A helper function to dedupe the hits stored in *hits*. Suppose the lite index
// contained a single document with two hits in a single prefix section: "foot"
// and "fool". When expanded, there would be four hits:
// {"fo", docid0, sectionid0}
// {"fo", docid0, sectionid0}
// {"foot", docid0, sectionid0}
// {"fool", docid0, sectionid0}
//
// The first two are duplicates of each other. So, this function will dedupe
// and shrink hits to be:
// {"fo", docid0, sectionid0}
// {"foot", docid0, sectionid0}
// {"fool", docid0, sectionid0}
//
// When two or more prefix hits are duplicates, they are merged into one hit
// whose term frequency is the sum of their term frequencies. If there are
// both an exact and a prefix hit for the same term, the exact hit is kept
// as-is and the prefix hit's term frequency is updated to the sum of the two
// term frequencies.
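// For example, if the two duplicate {"fo", docid0, sectionid0} prefix hits
// above had term frequencies 1 and 2, they would be merged into a single
// prefix hit with term frequency 3 (capped at Hit::kMaxTermFrequency).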
void DedupeHits(
    std::vector<TermIdHitPair>* hits, const TermIdCodec& term_id_codec,
    const std::unordered_map<uint32_t, int>& main_tvi_to_block_index) {
  // Sort so that all hits for a term are grouped together, then merge
  // equivalent hits into one.
  std::sort(hits->begin(), hits->end(),
            HitComparator(term_id_codec, main_tvi_to_block_index));
  size_t current_offset = 0;
  HitSelector hit_selector;
  for (const TermIdHitPair& term_id_hit_pair : *hits) {
    if (!hit_selector.IsEquivalentHit(term_id_hit_pair)) {
      // We've reached a hit for a new (term, document, section) group. Insert
      // the hits selected for the previous group and reset to start this one.
      current_offset = hit_selector.InsertSelectedHits(current_offset, hits);
      hit_selector.Reset();
    }
    // Update best exact and prefix hit.
    hit_selector.SelectIfBetter(term_id_hit_pair);
  }

  // Push last.
  current_offset = hit_selector.InsertSelectedHits(current_offset, hits);

  hits->resize(current_offset);
}

// Based on experiments with full prefix expansion, the multiplier
// is ~4x.
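// For example, a lite hit for "foot" in a prefix section might also be
// expanded into prefix hits for "foo" and "fo" when those prefixes already
// exist in the main lexicon.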
constexpr int kAvgPrefixesPerTerm = 4;

}  // namespace

libtextclassifier3::StatusOr<std::vector<TermIdHitPair>>
MainIndexMerger::TranslateAndExpandLiteHits(
    const LiteIndex& lite_index, const TermIdCodec& term_id_codec,
    const MainIndex::LexiconMergeOutputs& lexicon_merge_outputs) {
  std::vector<TermIdHitPair> hits;
  if (lite_index.empty()) {
    return hits;
  }
  // Reserve enough space for the average number of prefixes per term and the
  // terms themselves.
  hits.reserve(lite_index.size() * (kAvgPrefixesPerTerm + 1));

  // Translate lite tvis to main tvis.
  for (const TermIdHitPair& term_id_hit_pair : lite_index) {
    uint32_t cur_term_id = term_id_hit_pair.term_id();
    ICING_ASSIGN_OR_RETURN(TermIdCodec::DecodedTermInfo cur_decoded_term,
                           term_id_codec.DecodeTermInfo(cur_term_id));
    Hit hit(term_id_hit_pair.hit());

    // 1. Translate and push original.
    auto itr =
        lexicon_merge_outputs.other_tvi_to_main_tvi.find(cur_decoded_term.tvi);
    if (itr == lexicon_merge_outputs.other_tvi_to_main_tvi.cend()) {
      // b/37273773
      return absl_ports::InternalError(IcingStringUtil::StringPrintf(
          "Trying to translate lite tvi %u that was never added to the lexicon",
          cur_decoded_term.tvi));
    }
    ICING_ASSIGN_OR_RETURN(uint32_t term_id,
                           term_id_codec.EncodeTvi(itr->second, TviType::MAIN));
    hits.emplace_back(term_id, hit);

    // 2. Expand hits in prefix sections.
    if (hit.is_in_prefix_section()) {
      // Hit was in a prefix section. Push prefixes. Turn on prefix bit.
      auto itr_prefixes =
          lexicon_merge_outputs.other_tvi_to_prefix_main_tvis.find(
              cur_decoded_term.tvi);
      if (itr_prefixes ==
          lexicon_merge_outputs.other_tvi_to_prefix_main_tvis.end()) {
        ICING_VLOG(1) << "No prefix expansion necessary for "
                      << cur_decoded_term.tvi;
        continue;
      }
      // The main-lexicon tvis of all prefixes of this hit's term occupy the
      // half-open range prefix_tvis_buf[offset, offset + len).
      size_t offset = itr_prefixes->second.first;
      size_t len = itr_prefixes->second.second;
      size_t offset_end_exclusive = offset + len;
      Hit prefix_hit(hit.section_id(), hit.document_id(), hit.term_frequency(),
                     /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true);
      for (; offset < offset_end_exclusive; ++offset) {
        // Take the tvi (in the main lexicon) of each prefix term.
        uint32_t prefix_main_tvi =
            lexicon_merge_outputs.prefix_tvis_buf[offset];
        // Convert it to a term_id.
        ICING_ASSIGN_OR_RETURN(
            uint32_t prefix_term_id,
            term_id_codec.EncodeTvi(prefix_main_tvi, TviType::MAIN));
        // Add an element for this prefix TermId and prefix Hit to hits.
        hits.emplace_back(prefix_term_id, prefix_hit);
      }
    }
  }
  // 3. Remove any duplicate hits.
  DedupeHits(&hits, term_id_codec,
             lexicon_merge_outputs.main_tvi_to_block_index);
  return hits;
}
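
// A minimal caller sketch (hypothetical: the method names and signatures on
// MainIndex and LiteIndex below are assumptions based on the surrounding
// Icing code, not verified against it):
//
//   ICING_ASSIGN_OR_RETURN(MainIndex::LexiconMergeOutputs outputs,
//                          main_index->MergeLexicon(lite_index->lexicon()));
//   ICING_ASSIGN_OR_RETURN(
//       std::vector<TermIdHitPair> hits,
//       MainIndexMerger::TranslateAndExpandLiteHits(*lite_index,
//                                                   *term_id_codec, outputs));
//   // `hits` is now sorted and deduped, ready to be written into the main
//   // index's posting lists.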

}  // namespace lib
}  // namespace icing