1 files changed, 803 insertions, 0 deletions
diff --git a/icing/join/posting-list-join-data-serializer.h b/icing/join/posting-list-join-data-serializer.h
new file mode 100644
index 0000000..9f39dca
--- /dev/null
+++ b/icing/join/posting-list-join-data-serializer.h
@@ -0,0 +1,803 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_JOIN_POSTING_LIST_JOIN_DATA_SERIALIZER_H_
+#define ICING_JOIN_POSTING_LIST_JOIN_DATA_SERIALIZER_H_
+
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/file/posting_list/posting-list-common.h"
+#include "icing/file/posting_list/posting-list-used.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/util/logging.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+// A serializer class to serialize JoinDataType to PostingListUsed. Usually
+// JoinDataType is DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>,
+// DocumentIdToJoinInfo<TermId>, or DocumentIdToJoinInfo<int64_t>.
+//
+// REQUIRES:
+// - JoinDataType is comparable by operator <.
+// - JoinDataType implements is_valid() method.
+// - JoinDataType has static method GetInvalid() that returns a JoinDataType
+//   instance containing invalid data.
+template <typename JoinDataType>
+class PostingListJoinDataSerializer : public PostingListSerializer {
+ public:
+  using SpecialDataType = SpecialData<JoinDataType>;
+  static_assert(sizeof(SpecialDataType) == sizeof(JoinDataType), "");
+
+  static constexpr uint32_t kSpecialDataSize =
+      kNumSpecialData * sizeof(SpecialDataType);
+
+  uint32_t GetDataTypeBytes() const override { return sizeof(JoinDataType); }
+
+  uint32_t GetMinPostingListSize() const override {
+    static constexpr uint32_t kMinPostingListSize = kSpecialDataSize;
+    static_assert(sizeof(PostingListIndex) <= kMinPostingListSize,
+                  "PostingListIndex must be small enough to fit in a "
+                  "minimum-sized Posting List.");
+
+    return kMinPostingListSize;
+  }
+
+  uint32_t GetMinPostingListSizeToFit(
+      const PostingListUsed* posting_list_used) const override;
+
+  uint32_t GetBytesUsed(
+      const PostingListUsed* posting_list_used) const override;
+
+  void Clear(PostingListUsed* posting_list_used) const override;
+
+  libtextclassifier3::Status MoveFrom(PostingListUsed* dst,
+                                      PostingListUsed* src) const override;
+
+  // Prepend a JoinData to the posting list.
+  //
+  // RETURNS:
+  //   - INVALID_ARGUMENT if !data.is_valid() or if data is not greater than the
+  //     previously added data.
+  //   - RESOURCE_EXHAUSTED if there is no more room to add data to the posting
+  //     list.
+  libtextclassifier3::Status PrependData(PostingListUsed* posting_list_used,
+                                         const JoinDataType& data) const;
+
+  // Prepend multiple JoinData to the posting list.
+  // Data should be sorted in ascending order (as defined by the less than
+  // operator for JoinData)
+  // If keep_prepended is true, whatever could be prepended is kept, otherwise
+  // the posting list is reverted and left in its original state.
+  //
+  // RETURNS:
+  //   The number of data that have been prepended to the posting list. If
+  //   keep_prepended is false and reverted, then it returns 0.
+  libtextclassifier3::StatusOr<uint32_t> PrependDataArray(
+      PostingListUsed* posting_list_used, const JoinDataType* array,
+      uint32_t num_data, bool keep_prepended) const;
+
+  // Retrieves all data stored in the posting list.
+  //
+  // RETURNS:
+  //   - On success, a vector of JoinDataType sorted by the reverse order of
+  //     prepending.
+  //   - INTERNAL_ERROR if the posting list has been corrupted somehow.
+  libtextclassifier3::StatusOr<std::vector<JoinDataType>> GetData(
+      const PostingListUsed* posting_list_used) const;
+
+  // Same as GetData but appends data to data_arr_out.
+  //
+  // RETURNS:
+  //   - OK on success, and data_arr_out will be appended JoinDataType sorted by
+  //     the reverse order of prepending.
+  //   - INTERNAL_ERROR if the posting list has been corrupted somehow.
+  libtextclassifier3::Status GetData(
+      const PostingListUsed* posting_list_used,
+      std::vector<JoinDataType>* data_arr_out) const;
+
+  // Undo the last num_data data prepended. If num_data > number of data, then
+  // we clear all data.
+  //
+  // RETURNS:
+  //   - OK on success
+  //   - INTERNAL_ERROR if the posting list has been corrupted somehow.
+  libtextclassifier3::Status PopFrontData(PostingListUsed* posting_list_used,
+                                          uint32_t num_data) const;
+
+  // Helper function to determine if posting list is full.
+  bool IsFull(const PostingListUsed* posting_list_used) const {
+    return GetSpecialData(posting_list_used, /*index=*/0).data().is_valid() &&
+           GetSpecialData(posting_list_used, /*index=*/1).data().is_valid();
+  }
+
+ private:
+  // In PostingListJoinDataSerializer, there is no compression, but we still use
+  // the traditional posting list implementation.
+  //
+  // Posting list layout formats:
+  //
+  // NOT_FULL
+  // +-special-data-0--+-special-data-1--+------------+-----------------------+
+  // |                 |                 |            |                       |
+  // |data-start-offset|  Data::Invalid  | 0x00000000 |   (compressed) data   |
+  // |                 |                 |            |                       |
+  // +-----------------+-----------------+------------+-----------------------+
+  //
+  // ALMOST_FULL
+  // +-special-data-0--+-special-data-1--+-----+------------------------------+
+  // |                 |                 |     |                              |
+  // |  Data::Invalid  |    1st data     |(pad)|      (compressed) data       |
+  // |                 |                 |     |                              |
+  // +-----------------+-----------------+-----+------------------------------+
+  //
+  // FULL
+  // +-special-data-0--+-special-data-1--+-----+------------------------------+
+  // |                 |                 |     |                              |
+  // |    1st data     |    2nd data     |(pad)|      (compressed) data       |
+  // |                 |                 |     |                              |
+  // +-----------------+-----------------+-----+------------------------------+
+  //
+  // The first two uncompressed (special) data also implicitly encode
+  // information about the size of the compressed data region.
+  //
+  // 1. If the posting list is NOT_FULL, then special_data_0 contains the byte
+  //    offset of the start of the compressed data. Thus, the size of the
+  //    compressed data is
+  //    posting_list_used->size_in_bytes() - special_data_0.data_start_offset().
+  //
+  // 2. If posting list is ALMOST_FULL or FULL, then the compressed data region
+  //    starts somewhere between
+  //    [kSpecialDataSize, kSpecialDataSize + sizeof(JoinDataType) - 1] and ends
+  //    at posting_list_used->size_in_bytes() - 1.
+  //
+  // EXAMPLE
+  // JoinDataType = DocumentIdToJoinInfo<int64_t>. Posting list size: 48 bytes
+  //
+  // EMPTY!
+  // +-- byte 0-11 --+---- 12-23 ----+------------ 24-47 -------------+
+  // |               |               |                                |
+  // |      48       | Data::Invalid |           0x00000000           |
+  // |               |               |                                |
+  // +---------------+---------------+--------------------------------+
+  //
+  // Add DocumentIdToJoinInfo<int64_t>(DocumentId = 12, JoinInteger = 5)
+  // NOT FULL!
+  // +-- byte 0-11 --+---- 12-23 ----+---- 24-35 ----+---- 36-47 ----+
+  // |               |               |               | 12            |
+  // |      36       | Data::Invalid |  0x00000000   |  5            |
+  // |               |               |               |               |
+  // +---------------+---------------+---------------+---------------+
+  //
+  // Add DocumentIdToJoinInfo<int64_t>(DocumentId = 18, JoinInteger = -2)
+  // +-- byte 0-11 --+---- 12-23 ----+---- 24-35 ----+---- 36-47 ----+
+  // |               |               | 18            | 12            |
+  // |      24       | Data::Invalid | -2            |  5            |
+  // |               |               |               |               |
+  // +---------------+---------------+---------------+---------------+
+  //
+  // Add DocumentIdToJoinInfo<int64_t>(DocumentId = 22, JoinInteger = 3)
+  // ALMOST_FULL!
+  // +-- byte 0-11 --+---- 12-23 ----+---- 24-35 ----+---- 36-47 ----+
+  // |               | 22            | 18            | 12            |
+  // | Data::Invalid |  3            | -2            |  5            |
+  // |               |               |               |               |
+  // +---------------+---------------+---------------+---------------+
+  //
+  // Add DocumentIdToJoinInfo<int64_t>(DocumentId = 27, JoinInteger = 0)
+  // FULL!
+  // +-- byte 0-11 --+---- 12-23 ----+---- 24-35 ----+---- 36-47 ----+
+  // | 27            | 22            | 18            | 12            |
+  // |  0            |  3            | -2            |  5            |
+  // |               |               |               |               |
+  // +---------------+---------------+---------------+---------------+
+
+  // Helpers to determine what state the posting list is in.
+  bool IsAlmostFull(const PostingListUsed* posting_list_used) const {
+    return !GetSpecialData(posting_list_used, /*index=*/0).data().is_valid() &&
+           GetSpecialData(posting_list_used, /*index=*/1).data().is_valid();
+  }
+
+  bool IsEmpty(const PostingListUsed* posting_list_used) const {
+    return GetSpecialData(posting_list_used, /*index=*/0).data_start_offset() ==
+               posting_list_used->size_in_bytes() &&
+           !GetSpecialData(posting_list_used, /*index=*/1).data().is_valid();
+  }
+
+  // Returns false if both special data are invalid or if data start offset
+  // stored in the special data is less than kSpecialDataSize or greater than
+  // posting_list_used->size_in_bytes(). Returns true, otherwise.
+  bool IsPostingListValid(const PostingListUsed* posting_list_used) const;
+
+  // Prepend data to a posting list that is in the ALMOST_FULL state.
+  //
+  // RETURNS:
+  //  - OK, if successful
+  //  - INVALID_ARGUMENT if data is not less than the previously added data.
+  libtextclassifier3::Status PrependDataToAlmostFull(
+      PostingListUsed* posting_list_used, const JoinDataType& data) const;
+
+  // Prepend data to a posting list that is in the EMPTY state. This will always
+  // succeed because there are no pre-existing data and no validly constructed
+  // posting list could fail to fit one data.
+  void PrependDataToEmpty(PostingListUsed* posting_list_used,
+                          const JoinDataType& data) const;
+
+  // Prepend data to a posting list that is in the NOT_FULL state.
+  //
+  // RETURNS:
+  //  - OK, if successful
+  //  - INVALID_ARGUMENT if data is not less than the previously added data.
+  libtextclassifier3::Status PrependDataToNotFull(
+      PostingListUsed* posting_list_used, const JoinDataType& data,
+      uint32_t offset) const;
+
+  // Returns either 0 (FULL state), sizeof(JoinDataType) (ALMOST_FULL state) or
+  // a byte offset between kSpecialDataSize and
+  // posting_list_used->size_in_bytes() (inclusive) (NOT_FULL state).
+  uint32_t GetStartByteOffset(const PostingListUsed* posting_list_used) const;
+
+  // Sets special data 0 to properly reflect what start byte offset is (see
+  // layout comment for further details).
+  //
+  // Returns false if offset > posting_list_used->size_in_bytes() or offset is
+  // in range (kSpecialDataSize, sizeof(JoinDataType)) or
+  // (sizeof(JoinDataType), 0). True, otherwise.
+  bool SetStartByteOffset(PostingListUsed* posting_list_used,
+                          uint32_t offset) const;
+
+  // Helper for MoveFrom/GetData/PopFrontData. Adds limit number of data to out
+  // or all data in the posting list if the posting list contains less than
+  // limit number of data. out can be NULL.
+  //
+  // NOTE: If called with limit=1, pop=true on a posting list that transitioned
+  // from NOT_FULL directly to FULL, GetDataInternal will not return the posting
+  // list to NOT_FULL. Instead it will leave it in a valid state, but it will be
+  // ALMOST_FULL.
+  //
+  // RETURNS:
+  //   - OK on success
+  //   - INTERNAL_ERROR if the posting list has been corrupted somehow.
+  libtextclassifier3::Status GetDataInternal(
+      const PostingListUsed* posting_list_used, uint32_t limit, bool pop,
+      std::vector<JoinDataType>* out) const;
+
+  // Retrieves the value stored in the index-th special data.
+  //
+  // REQUIRES:
+  //   0 <= index < kNumSpecialData.
+  //
+  // RETURNS:
+  //   - A valid SpecialData<JoinDataType>.
+  SpecialDataType GetSpecialData(const PostingListUsed* posting_list_used,
+                                 uint32_t index) const;
+
+  // Sets the value stored in the index-th special data to special_data.
+  //
+  // REQUIRES:
+  //   0 <= index < kNumSpecialData.
+  void SetSpecialData(PostingListUsed* posting_list_used, uint32_t index,
+                      const SpecialDataType& special_data) const;
+
+  // Prepends data to the memory region
+  // [offset - sizeof(JoinDataType), offset - 1] and
+  // returns the new beginning of the region.
+  //
+  // RETURNS:
+  //   - The new beginning of the padded region, if successful.
+  //   - INVALID_ARGUMENT if data will not fit (uncompressed) between
+  //       [kSpecialDataSize, offset - 1]
+  libtextclassifier3::StatusOr<uint32_t> PrependDataUncompressed(
+      PostingListUsed* posting_list_used, const JoinDataType& data,
+      uint32_t offset) const;
+};
+
+template <typename JoinDataType>
+uint32_t PostingListJoinDataSerializer<JoinDataType>::GetBytesUsed(
+    const PostingListUsed* posting_list_used) const {
+  // The special data will be included if they represent actual data. If they
+  // represent the data start offset or the invalid data sentinel, they are not
+  // included.
+  return posting_list_used->size_in_bytes() -
+         GetStartByteOffset(posting_list_used);
+}
+
+template <typename JoinDataType>
+uint32_t
+PostingListJoinDataSerializer<JoinDataType>::GetMinPostingListSizeToFit(
+    const PostingListUsed* posting_list_used) const {
+  if (IsFull(posting_list_used) || IsAlmostFull(posting_list_used)) {
+    // If in either the FULL state or ALMOST_FULL state, this posting list *is*
+    // the minimum size posting list that can fit these data. So just return the
+    // size of the posting list.
+    return posting_list_used->size_in_bytes();
+  }
+
+  // In NOT_FULL state, BytesUsed contains no special data. The minimum sized
+  // posting list that would be guaranteed to fit these data would be
+  // ALMOST_FULL, with kInvalidData in special data 0, the uncompressed data in
+  // special data 1 and the n compressed data in the compressed region.
+  // BytesUsed contains one uncompressed data and n compressed data. Therefore,
+  // fitting these data into a posting list would require BytesUsed plus one
+  // extra data.
+  return GetBytesUsed(posting_list_used) + GetDataTypeBytes();
+}
+
+template <typename JoinDataType>
+void PostingListJoinDataSerializer<JoinDataType>::Clear(
+    PostingListUsed* posting_list_used) const {
+  // Safe to ignore return value because posting_list_used->size_in_bytes() is
+  // a valid argument.
+  SetStartByteOffset(posting_list_used,
+                     /*offset=*/posting_list_used->size_in_bytes());
+}
+
+template <typename JoinDataType>
+libtextclassifier3::Status
+PostingListJoinDataSerializer<JoinDataType>::MoveFrom(
+    PostingListUsed* dst, PostingListUsed* src) const {
+  ICING_RETURN_ERROR_IF_NULL(dst);
+  ICING_RETURN_ERROR_IF_NULL(src);
+  if (GetMinPostingListSizeToFit(src) > dst->size_in_bytes()) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "src MinPostingListSizeToFit %d must be larger than size %d.",
+        GetMinPostingListSizeToFit(src), dst->size_in_bytes()));
+  }
+
+  if (!IsPostingListValid(dst)) {
+    return absl_ports::FailedPreconditionError(
+        "Dst posting list is in an invalid state and can't be used!");
+  }
+  if (!IsPostingListValid(src)) {
+    return absl_ports::InvalidArgumentError(
+        "Cannot MoveFrom an invalid src posting list!");
+  }
+
+  // Pop just enough data that all of src's compressed data fit in
+  // dst posting_list's compressed area. Then we can memcpy that area.
+  std::vector<JoinDataType> data_arr;
+  while (IsFull(src) || IsAlmostFull(src) ||
+         (dst->size_in_bytes() - kSpecialDataSize < GetBytesUsed(src))) {
+    if (!GetDataInternal(src, /*limit=*/1, /*pop=*/true, &data_arr).ok()) {
+      return absl_ports::AbortedError(
+          "Unable to retrieve data from src posting list.");
+    }
+  }
+
+  // memcpy the area and set up start byte offset.
+  Clear(dst);
+  memcpy(dst->posting_list_buffer() + dst->size_in_bytes() - GetBytesUsed(src),
+         src->posting_list_buffer() + GetStartByteOffset(src),
+         GetBytesUsed(src));
+  // Because we popped all data from src outside of the compressed area and we
+  // guaranteed that GetBytesUsed(src) is less than dst->size_in_bytes() -
+  // kSpecialDataSize. This is guaranteed to be a valid byte offset for the
+  // NOT_FULL state, so ignoring the value is safe.
+  SetStartByteOffset(dst, dst->size_in_bytes() - GetBytesUsed(src));
+
+  // Put back remaining data.
+  for (auto riter = data_arr.rbegin(); riter != data_arr.rend(); ++riter) {
+    // PrependData may return:
+    // - INVALID_ARGUMENT: if data is invalid or not less than the previous data
+    // - RESOURCE_EXHAUSTED
+    // RESOURCE_EXHAUSTED should be impossible because we've already assured
+    // that there is enough room above.
+    ICING_RETURN_IF_ERROR(PrependData(dst, *riter));
+  }
+
+  Clear(src);
+  return libtextclassifier3::Status::OK;
+}
+
+template <typename JoinDataType>
+libtextclassifier3::Status
+PostingListJoinDataSerializer<JoinDataType>::PrependDataToAlmostFull(
+    PostingListUsed* posting_list_used, const JoinDataType& data) const {
+  SpecialDataType special_data = GetSpecialData(posting_list_used, /*index=*/1);
+  if (data < special_data.data()) {
+    return absl_ports::InvalidArgumentError(
+        "JoinData being prepended must not be smaller than the most recent "
+        "JoinData");
+  }
+
+  // Without compression, prepend a new data into ALMOST_FULL posting list will
+  // change the posting list to FULL state. Therefore, set special data 0
+  // directly.
+  SetSpecialData(posting_list_used, /*index=*/0, SpecialDataType(data));
+  return libtextclassifier3::Status::OK;
+}
+
+template <typename JoinDataType>
+void PostingListJoinDataSerializer<JoinDataType>::PrependDataToEmpty(
+    PostingListUsed* posting_list_used, const JoinDataType& data) const {
+  // First data to be added. Just add verbatim, no compression.
+  if (posting_list_used->size_in_bytes() == kSpecialDataSize) {
+    // First data will be stored at special data 1.
+    // Safe to ignore the return value because 1 < kNumSpecialData
+    SetSpecialData(posting_list_used, /*index=*/1, SpecialDataType(data));
+    // Safe to ignore the return value because sizeof(JoinDataType) is a valid
+    // argument.
+    SetStartByteOffset(posting_list_used, /*offset=*/sizeof(JoinDataType));
+  } else {
+    // Since this is the first data, size != kSpecialDataSize and
+    // size % sizeof(JoinDataType) == 0, we know that there is room to fit
+    // 'data' into the compressed region, so ValueOrDie is safe.
+    uint32_t offset =
+        PrependDataUncompressed(posting_list_used, data,
+                                /*offset=*/posting_list_used->size_in_bytes())
+            .ValueOrDie();
+    // Safe to ignore the return value because PrependDataUncompressed is
+    // guaranteed to return a valid offset.
+    SetStartByteOffset(posting_list_used, offset);
+  }
+}
+
+template <typename JoinDataType>
+libtextclassifier3::Status
+PostingListJoinDataSerializer<JoinDataType>::PrependDataToNotFull(
+    PostingListUsed* posting_list_used, const JoinDataType& data,
+    uint32_t offset) const {
+  JoinDataType curr = JoinDataType::GetInvalid();
+  memcpy(&curr, posting_list_used->posting_list_buffer() + offset,
+         sizeof(JoinDataType));
+  if (data < curr) {
+    return absl_ports::InvalidArgumentError(
+        "JoinData being prepended must not be smaller than the most recent "
+        "JoinData");
+  }
+
+  if (offset >= kSpecialDataSize + sizeof(JoinDataType)) {
+    offset =
+        PrependDataUncompressed(posting_list_used, data, offset).ValueOrDie();
+    SetStartByteOffset(posting_list_used, offset);
+  } else {
+    // The new data must be put in special data 1.
+    SetSpecialData(posting_list_used, /*index=*/1, SpecialDataType(data));
+    // State ALMOST_FULL. Safe to ignore the return value because
+    // sizeof(JoinDataType) is a valid argument.
+    SetStartByteOffset(posting_list_used, /*offset=*/sizeof(JoinDataType));
+  }
+  return libtextclassifier3::Status::OK;
+}
+
+template <typename JoinDataType>
+libtextclassifier3::Status
+PostingListJoinDataSerializer<JoinDataType>::PrependData(
+    PostingListUsed* posting_list_used, const JoinDataType& data) const {
+  if (!data.is_valid()) {
+    return absl_ports::InvalidArgumentError("Cannot prepend an invalid data!");
+  }
+  if (!IsPostingListValid(posting_list_used)) {
+    return absl_ports::FailedPreconditionError(
+        "This PostingListUsed is in an invalid state and can't add any data!");
+  }
+
+  if (IsFull(posting_list_used)) {
+    // State FULL: no space left.
+    return absl_ports::ResourceExhaustedError("No more room for data");
+  } else if (IsAlmostFull(posting_list_used)) {
+    return PrependDataToAlmostFull(posting_list_used, data);
+  } else if (IsEmpty(posting_list_used)) {
+    PrependDataToEmpty(posting_list_used, data);
+    return libtextclassifier3::Status::OK;
+  } else {
+    uint32_t offset = GetStartByteOffset(posting_list_used);
+    return PrependDataToNotFull(posting_list_used, data, offset);
+  }
+}
+
+template <typename JoinDataType>
+libtextclassifier3::StatusOr<uint32_t>
+PostingListJoinDataSerializer<JoinDataType>::PrependDataArray(
+    PostingListUsed* posting_list_used, const JoinDataType* array,
+    uint32_t num_data, bool keep_prepended) const {
+  if (!IsPostingListValid(posting_list_used)) {
+    return 0;
+  }
+
+  uint32_t i;
+  for (i = 0; i < num_data; ++i) {
+    if (!PrependData(posting_list_used, array[i]).ok()) {
+      break;
+    }
+  }
+  if (i != num_data && !keep_prepended) {
+    // Didn't fit. Undo everything and check that we have the same offset as
+    // before. PopFrontData guarantees that it will remove all 'i' data so long
+    // as there are at least 'i' data in the posting list, which we know there
+    // are.
+    ICING_RETURN_IF_ERROR(PopFrontData(posting_list_used, /*num_data=*/i));
+    return 0;
+  }
+  return i;
+}
+
+template <typename JoinDataType>
+libtextclassifier3::StatusOr<std::vector<JoinDataType>>
+PostingListJoinDataSerializer<JoinDataType>::GetData(
+    const PostingListUsed* posting_list_used) const {
+  std::vector<JoinDataType> data_arr_out;
+  ICING_RETURN_IF_ERROR(GetData(posting_list_used, &data_arr_out));
+  return data_arr_out;
+}
+
+template <typename JoinDataType>
+libtextclassifier3::Status PostingListJoinDataSerializer<JoinDataType>::GetData(
+    const PostingListUsed* posting_list_used,
+    std::vector<JoinDataType>* data_arr_out) const {
+  return GetDataInternal(posting_list_used,
+                         /*limit=*/std::numeric_limits<uint32_t>::max(),
+                         /*pop=*/false, data_arr_out);
+}
+
+template <typename JoinDataType>
+libtextclassifier3::Status
+PostingListJoinDataSerializer<JoinDataType>::PopFrontData(
+    PostingListUsed* posting_list_used, uint32_t num_data) const {
+  if (num_data == 1 && IsFull(posting_list_used)) {
+    // The PL is in FULL state which means that we save 2 uncompressed data in
+    // the 2 special postions. But FULL state may be reached by 2 different
+    // states.
+    // (1) In ALMOST_FULL state
+    // +------------------+-----------------+-----+---------------------------+
+    // |Data::Invalid     |1st data         |(pad)|(compressed) data          |
+    // |                  |                 |     |                           |
+    // +------------------+-----------------+-----+---------------------------+
+    // When we prepend another data, we can only put it at special data 0, and
+    // thus get a FULL PL
+    // +------------------+-----------------+-----+---------------------------+
+    // |new 1st data      |original 1st data|(pad)|(compressed) data          |
+    // |                  |                 |     |                           |
+    // +------------------+-----------------+-----+---------------------------+
+    //
+    // (2) In NOT_FULL state
+    // +------------------+-----------------+-------+---------+---------------+
+    // |data-start-offset |Data::Invalid    |(pad)  |1st data |(compressed)   |
+    // |                  |                 |       |         |data           |
+    // +------------------+-----------------+-------+---------+---------------+
+    // When we prepend another data, we can reach any of the 3 following
+    // scenarios:
+    // (2.1) NOT_FULL
+    // if the space of pad and original 1st data can accommodate the new 1st
+    // data and the encoded delta value.
+    // +------------------+-----------------+-----+--------+------------------+
+    // |data-start-offset |Data::Invalid    |(pad)|new     |(compressed) data |
+    // |                  |                 |     |1st data|                  |
+    // +------------------+-----------------+-----+--------+------------------+
+    // (2.2) ALMOST_FULL
+    // If the space of pad and original 1st data cannot accommodate the new 1st
+    // data and the encoded delta value but can accommodate the encoded delta
+    // value only. We can put the new 1st data at special position 1.
+    // +------------------+-----------------+---------+-----------------------+
+    // |Data::Invalid     |new 1st data     |(pad)    |(compressed) data      |
+    // |                  |                 |         |                       |
+    // +------------------+-----------------+---------+-----------------------+
+    // (2.3) FULL
+    // In very rare case, it cannot even accommodate only the encoded delta
+    // value. we can move the original 1st data into special position 1 and the
+    // new 1st data into special position 0. This may happen because we use
+    // VarInt encoding method which may make the encoded value longer (about
+    // 4/3 times of original)
+    // +------------------+-----------------+--------------+------------------+
+    // |new 1st data      |original 1st data|(pad)         |(compressed) data |
+    // |                  |                 |              |                  |
+    // +------------------+-----------------+--------------+------------------+
+    //
+    // Suppose now the PL is in FULL state. But we don't know whether it arrived
+    // this state from NOT_FULL (like (2.3)) or from ALMOST_FULL (like (1)).
+    // We'll return to ALMOST_FULL state like (1) if we simply pop the new 1st
+    // data, but we want to make the prepending operation "reversible". So
+    // there should be some way to return to NOT_FULL if possible. A simple way
+    // to do is:
+    // - Pop 2 data out of the PL to state ALMOST_FULL or NOT_FULL.
+    // - Add the second data ("original 1st data") back.
+    //
+    // Then we can return to the correct original states of (2.1) or (1). This
+    // makes our prepending operation reversible.
+    std::vector<JoinDataType> out;
+
+    // Popping 2 data should never fail because we've just ensured that the
+    // posting list is in the FULL state.
+    ICING_RETURN_IF_ERROR(
+        GetDataInternal(posting_list_used, /*limit=*/2, /*pop=*/true, &out));
+
+    // PrependData should never fail because:
+    // - out[1] is a valid data less than all previous data in the posting list.
+    // - There's no way that the posting list could run out of room because it
+    //   previously stored these 2 data.
+    ICING_RETURN_IF_ERROR(PrependData(posting_list_used, out[1]));
+  } else if (num_data > 0) {
+    return GetDataInternal(posting_list_used, /*limit=*/num_data, /*pop=*/true,
+                           /*out=*/nullptr);
+  }
+  return libtextclassifier3::Status::OK;
+}
+
+template <typename JoinDataType>
+libtextclassifier3::Status
+PostingListJoinDataSerializer<JoinDataType>::GetDataInternal(
+    const PostingListUsed* posting_list_used, uint32_t limit, bool pop,
+    std::vector<JoinDataType>* out) const {
+  uint32_t offset = GetStartByteOffset(posting_list_used);
+  uint32_t count = 0;
+
+  // First traverse the first two special positions.
+  while (count < limit && offset < kSpecialDataSize) {
+    // offset / sizeof(JoinDataType) < kNumSpecialData
+    // because of the check above.
+    SpecialDataType special_data = GetSpecialData(
+        posting_list_used, /*index=*/offset / sizeof(JoinDataType));
+    if (out != nullptr) {
+      out->push_back(special_data.data());
+    }
+    offset += sizeof(JoinDataType);
+    ++count;
+  }
+
+  // - We don't compress the data.
+  // - The posting list size is a multiple of data type bytes.
+  // So offset of the first non-special data is guaranteed to be at
+  // kSpecialDataSize if in ALMOST_FULL or FULL state. In fact, we must not
+  // apply padding skipping logic here when still storing uncompressed data,
+  // because in this case 0 bytes are meanful (e.g. inverted doc id byte = 0).
+  while (count < limit && offset < posting_list_used->size_in_bytes()) {
+    JoinDataType data = JoinDataType::GetInvalid();
+    memcpy(&data, posting_list_used->posting_list_buffer() + offset,
+           sizeof(JoinDataType));
+    offset += sizeof(JoinDataType);
+    if (out != nullptr) {
+      out->push_back(data);
+    }
+    ++count;
+  }
+
+  if (pop) {
+    PostingListUsed* mutable_posting_list_used =
+        const_cast<PostingListUsed*>(posting_list_used);
+    // Modify the posting list so that we pop all data actually traversed.
+    if (offset >= kSpecialDataSize &&
+        offset < posting_list_used->size_in_bytes()) {
+      memset(
+          mutable_posting_list_used->posting_list_buffer() + kSpecialDataSize,
+          0, offset - kSpecialDataSize);
+    }
+    SetStartByteOffset(mutable_posting_list_used, offset);
+  }
+
+  return libtextclassifier3::Status::OK;
+}
+
+template <typename JoinDataType>
+typename PostingListJoinDataSerializer<JoinDataType>::SpecialDataType
+PostingListJoinDataSerializer<JoinDataType>::GetSpecialData(
+    const PostingListUsed* posting_list_used, uint32_t index) const {
+  // It is ok to temporarily construct a SpecialData with offset = 0 since we're
+  // going to overwrite it by memcpy.
+  SpecialDataType special_data(0);
+  memcpy(&special_data,
+         posting_list_used->posting_list_buffer() +
+             index * sizeof(SpecialDataType),
+         sizeof(SpecialDataType));
+  return special_data;
+}
+
+template <typename JoinDataType>
+void PostingListJoinDataSerializer<JoinDataType>::SetSpecialData(
+    PostingListUsed* posting_list_used, uint32_t index,
+    const SpecialDataType& special_data) const {
+  memcpy(posting_list_used->posting_list_buffer() +
+             index * sizeof(SpecialDataType),
+         &special_data, sizeof(SpecialDataType));
+}
+
+template <typename JoinDataType>
+bool PostingListJoinDataSerializer<JoinDataType>::IsPostingListValid(
+    const PostingListUsed* posting_list_used) const {
+  if (IsAlmostFull(posting_list_used)) {
+    // Special data 1 should hold a valid data.
+    if (!GetSpecialData(posting_list_used, /*index=*/1).data().is_valid()) {
+      ICING_LOG(ERROR)
+          << "Both special data cannot be invalid at the same time.";
+      return false;
+    }
+  } else if (!IsFull(posting_list_used)) {
+    // NOT_FULL. Special data 0 should hold a valid offset.
+    SpecialDataType special_data =
+        GetSpecialData(posting_list_used, /*index=*/0);
+    if (special_data.data_start_offset() > posting_list_used->size_in_bytes() ||
+        special_data.data_start_offset() < kSpecialDataSize) {
+      ICING_LOG(ERROR) << "Offset: " << special_data.data_start_offset()
+                       << " size: " << posting_list_used->size_in_bytes()
+                       << " sp size: " << kSpecialDataSize;
+      return false;
+    }
+  }
+  return true;
+}
+
+template <typename JoinDataType>
+uint32_t PostingListJoinDataSerializer<JoinDataType>::GetStartByteOffset(
+    const PostingListUsed* posting_list_used) const {
+  if (IsFull(posting_list_used)) {
+    return 0;
+  } else if (IsAlmostFull(posting_list_used)) {
+    return sizeof(JoinDataType);
+  } else {
+    return GetSpecialData(posting_list_used, /*index=*/0).data_start_offset();
+  }
+}
+
+template <typename JoinDataType>
+bool PostingListJoinDataSerializer<JoinDataType>::SetStartByteOffset(
+    PostingListUsed* posting_list_used, uint32_t offset) const {
+  if (offset > posting_list_used->size_in_bytes()) {
+    ICING_LOG(ERROR) << "offset cannot be a value greater than size "
+                     << posting_list_used->size_in_bytes() << ". offset is "
+                     << offset << ".";
+    return false;
+  }
+  if (offset < kSpecialDataSize && offset > sizeof(JoinDataType)) {
+    ICING_LOG(ERROR) << "offset cannot be a value between ("
+                     << sizeof(JoinDataType) << ", " << kSpecialDataSize
+                     << "). offset is " << offset << ".";
+    return false;
+  }
+  if (offset < sizeof(JoinDataType) && offset != 0) {
+    ICING_LOG(ERROR) << "offset cannot be a value between (0, "
+                     << sizeof(JoinDataType) << "). offset is " << offset
+                     << ".";
+    return false;
+  }
+
+  if (offset >= kSpecialDataSize) {
+    // NOT_FULL state.
+    SetSpecialData(posting_list_used, /*index=*/0, SpecialDataType(offset));
+    SetSpecialData(posting_list_used, /*index=*/1,
+                   SpecialDataType(JoinDataType::GetInvalid()));
+  } else if (offset == sizeof(JoinDataType)) {
+    // ALMOST_FULL state.
+    SetSpecialData(posting_list_used, /*index=*/0,
+                   SpecialDataType(JoinDataType::GetInvalid()));
+  }
+  // Nothing to do for the FULL state - the offset isn't actually stored
+  // anywhere and both 2 special data hold valid data.
+  return true;
+}
+
+template <typename JoinDataType>
+libtextclassifier3::StatusOr<uint32_t>
+PostingListJoinDataSerializer<JoinDataType>::PrependDataUncompressed(
+    PostingListUsed* posting_list_used, const JoinDataType& data,
+    uint32_t offset) const {
+  if (offset < kSpecialDataSize + sizeof(JoinDataType)) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "Not enough room to prepend JoinData at offset %d.", offset));
+  }
+  offset -= sizeof(JoinDataType);
+  memcpy(posting_list_used->posting_list_buffer() + offset, &data,
+         sizeof(JoinDataType));
+  return offset;
+}
+
+}  // namespace lib
+}  // namespace icing
+
+#endif  // ICING_JOIN_POSTING_LIST_JOIN_DATA_SERIALIZER_H_