aboutsummaryrefslogtreecommitdiff
path: root/icing/index/main/posting-list-hit-serializer.h
blob: 2986d9c64ee1257a5e4a3f59387d70a553083ddd (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
// Copyright (C) 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef ICING_INDEX_MAIN_POSTING_LIST_HIT_SERIALIZER_H_
#define ICING_INDEX_MAIN_POSTING_LIST_HIT_SERIALIZER_H_

#include <cstdint>
#include <vector>

#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/file/posting_list/posting-list-common.h"
#include "icing/file/posting_list/posting-list-used.h"
#include "icing/index/hit/hit.h"
#include "icing/util/status-macros.h"

namespace icing {
namespace lib {

// A serializer class to serialize hits to PostingListUsed. Layout described in
// comments in posting-list-hit-serializer.cc.
class PostingListHitSerializer : public PostingListSerializer {
 public:
  static constexpr uint32_t kSpecialHitsSize = kNumSpecialData * sizeof(Hit);

  uint32_t GetDataTypeBytes() const override { return sizeof(Hit); }

  uint32_t GetMinPostingListSize() const override {
    static constexpr uint32_t kMinPostingListSize = kSpecialHitsSize;
    static_assert(sizeof(PostingListIndex) <= kMinPostingListSize,
                  "PostingListIndex must be small enough to fit in a "
                  "minimum-sized Posting List.");

    return kMinPostingListSize;
  }

  uint32_t GetMinPostingListSizeToFit(
      const PostingListUsed* posting_list_used) const override;

  uint32_t GetBytesUsed(
      const PostingListUsed* posting_list_used) const override;

  void Clear(PostingListUsed* posting_list_used) const override;

  libtextclassifier3::Status MoveFrom(PostingListUsed* dst,
                                      PostingListUsed* src) const override;

  // Prepend a hit to the posting list.
  //
  // RETURNS:
  //   - INVALID_ARGUMENT if !hit.is_valid() or if hit is not less than the
  //       previously added hit.
  //   - RESOURCE_EXHAUSTED if there is no more room to add hit to the posting
  //       list.
  libtextclassifier3::Status PrependHit(PostingListUsed* posting_list_used,
                                        const Hit& hit) const;

  // Prepend hits to the posting list. Hits should be sorted in descending order
  // (as defined by the less than operator for Hit)
  //
  // Returns the number of hits that could be prepended to the posting list. If
  // keep_prepended is true, whatever could be prepended is kept, otherwise the
  // posting list is left in its original state.
  template <class T, Hit (*GetHit)(const T&)>
  libtextclassifier3::StatusOr<uint32_t> PrependHitArray(
      PostingListUsed* posting_list_used, const T* array, uint32_t num_hits,
      bool keep_prepended) const;

  // Retrieves the hits stored in the posting list.
  //
  // RETURNS:
  //   - On success, a vector of hits sorted by the reverse order of prepending.
  //   - INTERNAL_ERROR if the posting list has been corrupted somehow.
  libtextclassifier3::StatusOr<std::vector<Hit>> GetHits(
      const PostingListUsed* posting_list_used) const;

  // Same as GetHits but appends hits to hits_out.
  //
  // RETURNS:
  //   - On success, a vector of hits sorted by the reverse order of prepending.
  //   - INTERNAL_ERROR if the posting list has been corrupted somehow.
  libtextclassifier3::Status GetHits(const PostingListUsed* posting_list_used,
                                     std::vector<Hit>* hits_out) const;

  // Undo the last num_hits hits prepended. If num_hits > number of
  // hits we clear all hits.
  //
  // RETURNS:
  //   - OK on success
  //   - INTERNAL_ERROR if the posting list has been corrupted somehow.
  libtextclassifier3::Status PopFrontHits(PostingListUsed* posting_list_used,
                                          uint32_t num_hits) const;

 private:
  // Posting list layout formats:
  //
  // not_full
  //
  // +-----------------+----------------+-------+-----------------+
  // |hits-start-offset|Hit::kInvalidVal|xxxxxxx|(compressed) hits|
  // +-----------------+----------------+-------+-----------------+
  //
  // almost_full
  //
  // +-----------------+----------------+-------+-----------------+
  // |Hit::kInvalidVal |1st hit         |(pad)  |(compressed) hits|
  // +-----------------+----------------+-------+-----------------+
  //
  // full()
  //
  // +-----------------+----------------+-------+-----------------+
  // |1st hit          |2nd hit         |(pad)  |(compressed) hits|
  // +-----------------+----------------+-------+-----------------+
  //
  // The first two uncompressed hits also implicitly encode information about
  // the size of the compressed hits region.
  //
  // 1. If the posting list is NOT_FULL, then
  // posting_list_buffer_[0] contains the byte offset of the start of the
  // compressed hits - and, thus, the size of the compressed hits region is
  // size_in_bytes - posting_list_buffer_[0].
  //
  // 2. If posting list is ALMOST_FULL or FULL, then the compressed hits region
  // starts somewhere between [kSpecialHitsSize, kSpecialHitsSize + sizeof(Hit)
  // - 1] and ends at size_in_bytes - 1.
  //
  // Hit term frequencies are stored after the hit value, compressed or
  // uncompressed. For the first two special hits, we always have a
  // space for the term frequency. For hits in the compressed area, we only have
  // the term frequency following the hit value of hit.has_term_frequency() is
  // true. This allows good compression in the common case where hits don't have
  // a valid term frequency.
  //
  // EXAMPLE
  // Posting list storage. Posting list size: 20 bytes
  // EMPTY!
  // +--bytes 0-4--+----- 5-9 ------+---------------- 10-19 -----------------+
  // |     20      |Hit::kInvalidVal|                 0x000                  |
  // +-------------+----------------+----------------+-----------------------+
  //
  // Add Hit 0x07FFF998 (DocumentId = 12, SectionId = 3, Flags = 0)
  // NOT FULL!
  // +--bytes 0-4--+----- 5-9 ------+----- 10-15 -----+-------- 16-19 -------+
  // |     16      |Hit::kInvalidVal|      0x000      |       0x07FFF998     |
  // +-------------+----------------+-----------------+----------------------+
  //
  // Add Hit 0x07FFF684 (DocumentId = 18, SectionId = 0, Flags = 4,
  // TermFrequency=125)
  // (Hit 0x07FFF998 - Hit 0x07FFF684 = 788)
  // +--bytes 0-4--+----- 5-9 ------+-- 10-12 --+-- 13-16 --+- 17 -+-- 18-19 --+
  // |      13     |Hit::kInvalidVal|   0x000   | 0x07FFF684| 125  |    788    |
  // +-------------+----------------+-----------+-----------+------+-----------+
  //
  // Add Hit 0x07FFF4D2 (DocumentId = 22, SectionId = 10, Flags = 2)
  // (Hit 0x07FFF684 - Hit 0x07FFF4D2 = 434)
  // +--bytes 0-4--+--- 5-9 ----+-- 10 --+-- 11-14 -+- 15-16 -+- 17 -+- 18-19 -+
  // |      9      |Hit::kInvVal|  0x00  |0x07FFF4D2|   434   | 125  |   788   |
  // +-------------+------------+--------+----------+---------+------+---------+
  //
  // Add Hit 0x07FFF40E (DocumentId = 23, SectionId = 1, Flags = 6,
  // TermFrequency = 87)
  // (Hit 0x07FFF684 - Hit 0x07FFF4D2 = 196) ALMOST FULL!
  // +--bytes 0-4-+---- 5-9 ----+- 10-12 -+- 13-14 -+- 15-16 -+- 17 -+- 18-19 -+
  // |Hit::kInvVal|0x07FFF40E,87|  0x000  |    196  |   434   |  125 |   788   |
  // +-------------+------------+---------+---------+---------+------+---------+
  //
  // Add Hit 0x07FFF320 (DocumentId = 27, SectionId = 4, Flags = 0)
  // FULL!
  // +--bytes 0-4--+---- 5-9 ----+- 10-13 -+-- 14-15 -+- 16-17 -+- 18 -+- 19-20
  // -+ | 0x07FFF320  |0x07FFF40E,87|  0x000  |    196   |   434   |  125 | 788
  // |
  // +-------------+-------------+---------+----------+---------+------+---------+

  // Helpers to determine what state the posting list is in.
  bool IsFull(const PostingListUsed* posting_list_used) const {
    return GetSpecialHit(posting_list_used, /*index=*/0)
               .ValueOrDie()
               .is_valid() &&
           GetSpecialHit(posting_list_used, /*index=*/1)
               .ValueOrDie()
               .is_valid();
  }

  bool IsAlmostFull(const PostingListUsed* posting_list_used) const {
    return !GetSpecialHit(posting_list_used, /*index=*/0)
                .ValueOrDie()
                .is_valid();
  }

  bool IsEmpty(const PostingListUsed* posting_list_used) const {
    return GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value() ==
               posting_list_used->size_in_bytes() &&
           !GetSpecialHit(posting_list_used, /*index=*/1)
                .ValueOrDie()
                .is_valid();
  }

  // Returns false if both special hits are invalid or if the offset value
  // stored in the special hit is less than kSpecialHitsSize or greater than
  // posting_list_used->size_in_bytes(). Returns true, otherwise.
  bool IsPostingListValid(const PostingListUsed* posting_list_used) const;

  // Prepend hit to a posting list that is in the ALMOST_FULL state.
  // RETURNS:
  //  - OK, if successful
  //  - INVALID_ARGUMENT if hit is not less than the previously added hit.
  libtextclassifier3::Status PrependHitToAlmostFull(
      PostingListUsed* posting_list_used, const Hit& hit) const;

  // Prepend hit to a posting list that is in the EMPTY state. This will always
  // succeed because there are no pre-existing hits and no validly constructed
  // posting list could fail to fit one hit.
  void PrependHitToEmpty(PostingListUsed* posting_list_used,
                         const Hit& hit) const;

  // Prepend hit to a posting list that is in the NOT_FULL state.
  // RETURNS:
  //  - OK, if successful
  //  - INVALID_ARGUMENT if hit is not less than the previously added hit.
  libtextclassifier3::Status PrependHitToNotFull(
      PostingListUsed* posting_list_used, const Hit& hit,
      uint32_t offset) const;

  // Returns either 0 (full state), sizeof(Hit) (almost_full state) or
  // a byte offset between kSpecialHitsSize and
  // posting_list_used->size_in_bytes() (inclusive) (not_full state).
  uint32_t GetStartByteOffset(const PostingListUsed* posting_list_used) const;

  // Sets the special hits to properly reflect what offset is (see layout
  // comment for further details).
  //
  // Returns false if offset > posting_list_used->size_in_bytes() or offset is
  // (kSpecialHitsSize, sizeof(Hit)) or offset is (sizeof(Hit), 0). True,
  // otherwise.
  bool SetStartByteOffset(PostingListUsed* posting_list_used,
                          uint32_t offset) const;

  // Manipulate padded areas. We never store the same hit value twice
  // so a delta of 0 is a pad byte.

  // Returns offset of first non-pad byte.
  uint32_t GetPadEnd(const PostingListUsed* posting_list_used,
                     uint32_t offset) const;

  // Fill padding between offset start and offset end with 0s.
  // Returns false if end > posting_list_used->size_in_bytes(). True,
  // otherwise.
  bool PadToEnd(PostingListUsed* posting_list_used, uint32_t start,
                uint32_t end) const;

  // Helper for AppendHits/PopFrontHits. Adds limit number of hits to out or all
  // hits in the posting list if the posting list contains less than limit
  // number of hits. out can be NULL.
  //
  // NOTE: If called with limit=1, pop=true on a posting list that transitioned
  // from NOT_FULL directly to FULL, GetHitsInternal will not return the posting
  // list to NOT_FULL. Instead it will leave it in a valid state, but it will be
  // ALMOST_FULL.
  //
  // RETURNS:
  //   - OK on success
  //   - INTERNAL_ERROR if the posting list has been corrupted somehow.
  libtextclassifier3::Status GetHitsInternal(
      const PostingListUsed* posting_list_used, uint32_t limit, bool pop,
      std::vector<Hit>* out) const;

  // Retrieves the value stored in the index-th special hit.
  //
  // RETURNS:
  //   - A valid Hit, on success
  //   - INVALID_ARGUMENT if index is not less than kNumSpecialData
  libtextclassifier3::StatusOr<Hit> GetSpecialHit(
      const PostingListUsed* posting_list_used, uint32_t index) const;

  // Sets the value stored in the index-th special hit to val. If index is not
  // less than kSpecialHitSize / sizeof(Hit), this has no effect.
  bool SetSpecialHit(PostingListUsed* posting_list_used, uint32_t index,
                     const Hit& val) const;

  // Prepends hit to the memory region [offset - sizeof(Hit), offset] and
  // returns the new beginning of the padded region.
  //
  // RETURNS:
  //   - The new beginning of the padded region, if successful.
  //   - INVALID_ARGUMENT if hit will not fit (uncompressed) between offset and
  // kSpecialHitsSize
  libtextclassifier3::StatusOr<uint32_t> PrependHitUncompressed(
      PostingListUsed* posting_list_used, const Hit& hit,
      uint32_t offset) const;

  // If hit has a term frequency, consumes the term frequency at offset, updates
  // hit to include the term frequency and updates offset to reflect that the
  // term frequency has been consumed.
  //
  // RETURNS:
  //   - OK, if successful
  //   - INVALID_ARGUMENT if hit has a term frequency and offset +
  //     sizeof(Hit::TermFrequency) >= posting_list_used->size_in_bytes()
  libtextclassifier3::Status ConsumeTermFrequencyIfPresent(
      const PostingListUsed* posting_list_used, Hit* hit,
      uint32_t* offset) const;
};

// Inlined functions. Implementation details below. Avert eyes!
template <class T, Hit (*GetHit)(const T&)>
libtextclassifier3::StatusOr<uint32_t>
PostingListHitSerializer::PrependHitArray(PostingListUsed* posting_list_used,
                                          const T* array, uint32_t num_hits,
                                          bool keep_prepended) const {
  if (!IsPostingListValid(posting_list_used)) {
    return 0;
  }

  // Prepend hits working backwards from array[num_hits - 1].
  uint32_t i;
  for (i = 0; i < num_hits; ++i) {
    if (!PrependHit(posting_list_used, GetHit(array[num_hits - i - 1])).ok()) {
      break;
    }
  }
  if (i != num_hits && !keep_prepended) {
    // Didn't fit. Undo everything and check that we have the same offset as
    // before. PopFrontHits guarantees that it will remove all 'i' hits so long
    // as there are at least 'i' hits in the posting list, which we know there
    // are.
    ICING_RETURN_IF_ERROR(PopFrontHits(posting_list_used, /*num_hits=*/i));
  }
  return i;
}

}  // namespace lib
}  // namespace icing

#endif  // ICING_INDEX_MAIN_POSTING_LIST_HIT_SERIALIZER_H_