aboutsummaryrefslogtreecommitdiff
path: root/src/encodetable.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/encodetable.cc')
-rw-r--r--src/encodetable.cc392
1 files changed, 392 insertions, 0 deletions
diff --git a/src/encodetable.cc b/src/encodetable.cc
new file mode 100644
index 0000000..9c4001c
--- /dev/null
+++ b/src/encodetable.cc
@@ -0,0 +1,392 @@
+// Copyright 2008 Google Inc.
+// Author: Lincoln Smith
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <config.h>
+#include <limits.h> // UCHAR_MAX
+#include <string>
+#include "addrcache.h"
+#include "codetable.h"
+#include "encodetable.h"
+#include "instruction_map.h"
+#include "logging.h"
+#include "google/output_string.h"
+#include "varint_bigendian.h"
+#include "vcdiff_defs.h"
+
+namespace open_vcdiff {
+
+static const DeltaFileHeader kHeaderStandardFormat = {
+ 0xD6, // Header1: "V" | 0x80
+ 0xC3, // Header2: "C" | 0x80
+ 0xC4, // Header3: "D" | 0x80
+ 0x00, // Header4: Draft standard format
+ 0x00 }; // Hdr_Indicator:
+ // No compression, no custom code table
+
+static const DeltaFileHeader kHeaderExtendedFormat = {
+ 0xD6, // Header1: "V" | 0x80
+ 0xC3, // Header2: "C" | 0x80
+ 0xC4, // Header3: "D" | 0x80
+ 'S', // Header4: VCDIFF/SDCH, extensions used
+ 0x00 }; // Hdr_Indicator:
+ // No compression, no custom code table
+
+// VCDiffCodeTableWriter members and methods
+
+// If interleaved is true, the encoder writes each delta file window
+// by interleaving instructions and sizes with their corresponding
+// addresses and data, rather than placing these elements into three
+// separate sections. This facilitates providing partially
+// decoded results when only a portion of a delta file window
+// is received (e.g. when HTTP over TCP is used as the
+// transmission protocol.) The interleaved format is
+// not consistent with the VCDIFF draft standard.
+//
+VCDiffCodeTableWriter::VCDiffCodeTableWriter(bool interleaved)
+ : max_mode_(VCDiffAddressCache::DefaultLastMode()),
+ dictionary_size_(0),
+ target_length_(0),
+ code_table_data_(&VCDiffCodeTableData::kDefaultCodeTableData),
+ instruction_map_(NULL),
+ last_opcode_index_(-1),
+ add_checksum_(false),
+ checksum_(0) {
+ InitSectionPointers(interleaved);
+}
+
+VCDiffCodeTableWriter::VCDiffCodeTableWriter(
+ bool interleaved,
+ int near_cache_size,
+ int same_cache_size,
+ const VCDiffCodeTableData& code_table_data,
+ unsigned char max_mode)
+ : max_mode_(max_mode),
+ address_cache_(near_cache_size, same_cache_size),
+ dictionary_size_(0),
+ target_length_(0),
+ code_table_data_(&code_table_data),
+ instruction_map_(NULL),
+ last_opcode_index_(-1),
+ add_checksum_(false),
+ checksum_(0) {
+ InitSectionPointers(interleaved);
+}
+
+VCDiffCodeTableWriter::~VCDiffCodeTableWriter() {
+ if (code_table_data_ != &VCDiffCodeTableData::kDefaultCodeTableData) {
+ delete instruction_map_;
+ }
+}
+
+void VCDiffCodeTableWriter::InitSectionPointers(bool interleaved) {
+ if (interleaved) {
+ data_for_add_and_run_ = &instructions_and_sizes_;
+ addresses_for_copy_ = &instructions_and_sizes_;
+ } else {
+ data_for_add_and_run_ = &separate_data_for_add_and_run_;
+ addresses_for_copy_ = &separate_addresses_for_copy_;
+ }
+}
+
+bool VCDiffCodeTableWriter::Init(size_t dictionary_size) {
+ dictionary_size_ = dictionary_size;
+ if (!instruction_map_) {
+ if (code_table_data_ == &VCDiffCodeTableData::kDefaultCodeTableData) {
+ instruction_map_ = VCDiffInstructionMap::GetDefaultInstructionMap();
+ } else {
+ instruction_map_ = new VCDiffInstructionMap(*code_table_data_, max_mode_);
+ }
+ if (!instruction_map_) {
+ return false;
+ }
+ }
+ if (!address_cache_.Init()) {
+ return false;
+ }
+ target_length_ = 0;
+ last_opcode_index_ = -1;
+ return true;
+}
+
+void VCDiffCodeTableWriter::WriteHeader(
+ OutputStringInterface* out,
+ VCDiffFormatExtensionFlags format_extensions) {
+ if (format_extensions == VCD_STANDARD_FORMAT) {
+ out->append(reinterpret_cast<const char*>(&kHeaderStandardFormat),
+ sizeof(kHeaderStandardFormat));
+ } else {
+ out->append(reinterpret_cast<const char*>(&kHeaderExtendedFormat),
+ sizeof(kHeaderExtendedFormat));
+ }
+ // If custom cache table sizes or a custom code table were used
+ // for encoding, here is where they would be appended to *output.
+ // This implementation of the encoder does not use those features,
+ // although the decoder can understand and interpret them.
+}
+
+// The VCDiff format allows each opcode to represent either
+// one or two delta instructions. This function will first
+// examine the opcode generated by the last call to EncodeInstruction.
+// If that opcode was a single-instruction opcode, this function checks
+// whether there is a compound (double-instruction) opcode that can
+// combine that single instruction with the instruction that is now
+// being added, and so save a byte of space. In that case, the
+// single-instruction opcode at position last_opcode_index_ will be
+// overwritten with the new double-instruction opcode.
+//
+// In the majority of cases, no compound opcode will be possible,
+// and a new single-instruction opcode will be appended to
+// instructions_and_sizes_, followed by a representation of its size
+// if the opcode does not implicitly give the instruction size.
+//
+// As an example, say instructions_and_sizes_ contains 10 bytes, the last
+// of which contains the opcode 0x02 (ADD size 1). Because that was the
+// most recently added opcode, last_opcode_index_ has the value 10.
+// EncodeInstruction is then called with inst = VCD_COPY, size = 4, mode = 0.
+// The function will replace the old opcode 0x02 with the double-instruction
+// opcode 0xA3 (ADD size 1 + COPY size 4 mode 0).
+//
+// All of the double-instruction opcodes in the standard code table
+// have implicit sizes, meaning that the size of the instruction will not
+// need to be written to the instructions_and_sizes_ string separately
+// from the opcode. If a custom code table were used that did not have
+// this property, then instructions_and_sizes_ might contain a
+// double-instruction opcode (say, COPY size 0 mode 0 + ADD size 0)
+// followed by the size of the COPY, then by the size of the ADD.
+// If using the SDCH interleaved format, the address of the COPY instruction
+// would follow its size, so the ordering would be
+// [Compound Opcode][Size of COPY][Address of COPY][Size of ADD]
+//
+void VCDiffCodeTableWriter::EncodeInstruction(VCDiffInstructionType inst,
+ size_t size,
+ unsigned char mode) {
+ if (!instruction_map_) {
+ VCD_DFATAL << "EncodeInstruction() called without calling Init()"
+ << VCD_ENDL;
+ return;
+ }
+ if (last_opcode_index_ >= 0) {
+ const unsigned char last_opcode =
+ instructions_and_sizes_[last_opcode_index_];
+ // The encoding engine should not generate two ADD instructions in a row.
+ // This won't cause a failure, but it's inefficient encoding and probably
+ // represents a bug in the higher-level logic of the encoder.
+ if ((inst == VCD_ADD) &&
+ (code_table_data_->inst1[last_opcode] == VCD_ADD)) {
+ VCD_WARNING << "EncodeInstruction() called for two ADD instructions"
+ " in a row" << VCD_ENDL;
+ }
+ OpcodeOrNone compound_opcode = kNoOpcode;
+ if (size <= UCHAR_MAX) {
+ compound_opcode =
+ instruction_map_->LookupSecondOpcode(last_opcode,
+ inst,
+ static_cast<unsigned char>(size),
+ mode);
+ if (compound_opcode != kNoOpcode) {
+ instructions_and_sizes_[last_opcode_index_] =
+ static_cast<unsigned char>(compound_opcode);
+ last_opcode_index_ = -1;
+ return;
+ }
+ }
+ // Try finding a compound opcode with size 0.
+ compound_opcode = instruction_map_->LookupSecondOpcode(last_opcode,
+ inst,
+ 0,
+ mode);
+ if (compound_opcode != kNoOpcode) {
+ instructions_and_sizes_[last_opcode_index_] =
+ static_cast<unsigned char>(compound_opcode);
+ last_opcode_index_ = -1;
+ AppendSizeToString(size, &instructions_and_sizes_);
+ return;
+ }
+ }
+ OpcodeOrNone opcode = kNoOpcode;
+ if (size <= UCHAR_MAX) {
+ opcode =
+ instruction_map_->LookupFirstOpcode(inst,
+ static_cast<unsigned char>(size),
+ mode);
+ if (opcode != kNoOpcode) {
+ instructions_and_sizes_.push_back(static_cast<char>(opcode));
+ last_opcode_index_ = static_cast<int>(instructions_and_sizes_.size() - 1);
+ return;
+ }
+ }
+ // There should always be an opcode with size 0.
+ opcode = instruction_map_->LookupFirstOpcode(inst, 0, mode);
+ if (opcode == kNoOpcode) {
+ VCD_DFATAL << "No matching opcode found for inst " << inst
+ << ", mode " << mode << ", size 0" << VCD_ENDL;
+ return;
+ }
+ instructions_and_sizes_.push_back(static_cast<char>(opcode));
+ last_opcode_index_ = static_cast<int>(instructions_and_sizes_.size() - 1);
+ AppendSizeToString(size, &instructions_and_sizes_);
+}
+
+void VCDiffCodeTableWriter::Add(const char* data, size_t size) {
+ EncodeInstruction(VCD_ADD, size);
+ data_for_add_and_run_->append(data, size);
+ target_length_ += size;
+}
+
+void VCDiffCodeTableWriter::Copy(int32_t offset, size_t size) {
+ if (!instruction_map_) {
+ VCD_DFATAL << "VCDiffCodeTableWriter::Copy() called without calling Init()"
+ << VCD_ENDL;
+ return;
+ }
+ // If a single interleaved stream of encoded values is used
+ // instead of separate sections for instructions, addresses, and data,
+ // then the string instructions_and_sizes_ may be the same as
+ // addresses_for_copy_. The address should therefore be encoded
+ // *after* the instruction and its size.
+ int32_t encoded_addr = 0;
+ const unsigned char mode = address_cache_.EncodeAddress(
+ offset,
+ static_cast<VCDAddress>(dictionary_size_ + target_length_),
+ &encoded_addr);
+ EncodeInstruction(VCD_COPY, size, mode);
+ if (address_cache_.WriteAddressAsVarintForMode(mode)) {
+ VarintBE<int32_t>::AppendToString(encoded_addr, addresses_for_copy_);
+ } else {
+ addresses_for_copy_->push_back(static_cast<unsigned char>(encoded_addr));
+ }
+ target_length_ += size;
+}
+
+void VCDiffCodeTableWriter::Run(size_t size, unsigned char byte) {
+ EncodeInstruction(VCD_RUN, size);
+ data_for_add_and_run_->push_back(byte);
+ target_length_ += size;
+}
+
+size_t VCDiffCodeTableWriter::CalculateLengthOfSizeAsVarint(size_t size) {
+ return VarintBE<int32_t>::Length(static_cast<int32_t>(size));
+}
+
+void VCDiffCodeTableWriter::AppendSizeToString(size_t size, string* out) {
+ VarintBE<int32_t>::AppendToString(static_cast<int32_t>(size), out);
+}
+
+void VCDiffCodeTableWriter::AppendSizeToOutputString(
+ size_t size,
+ OutputStringInterface* out) {
+ VarintBE<int32_t>::AppendToOutputString(static_cast<int32_t>(size), out);
+}
+
+// This calculation must match the items added between "Start of Delta Encoding"
+// and "End of Delta Encoding" in Output(), below.
+size_t VCDiffCodeTableWriter::CalculateLengthOfTheDeltaEncoding() const {
+ size_t length_of_the_delta_encoding =
+ CalculateLengthOfSizeAsVarint(target_length_) +
+ 1 + // Delta_Indicator
+ CalculateLengthOfSizeAsVarint(separate_data_for_add_and_run_.size()) +
+ CalculateLengthOfSizeAsVarint(instructions_and_sizes_.size()) +
+ CalculateLengthOfSizeAsVarint(separate_addresses_for_copy_.size()) +
+ separate_data_for_add_and_run_.size() +
+ instructions_and_sizes_.size() +
+ separate_addresses_for_copy_.size();
+ if (add_checksum_) {
+ length_of_the_delta_encoding +=
+ VarintBE<int64_t>::Length(static_cast<int64_t>(checksum_));
+ }
+ return length_of_the_delta_encoding;
+}
+
+void VCDiffCodeTableWriter::Output(OutputStringInterface* out) {
+ if (instructions_and_sizes_.empty()) {
+ VCD_WARNING << "Empty input; no delta window produced" << VCD_ENDL;
+ } else {
+ const size_t length_of_the_delta_encoding =
+ CalculateLengthOfTheDeltaEncoding();
+ const size_t delta_window_size =
+ length_of_the_delta_encoding +
+ 1 + // Win_Indicator
+ CalculateLengthOfSizeAsVarint(dictionary_size_) +
+ CalculateLengthOfSizeAsVarint(0) +
+ CalculateLengthOfSizeAsVarint(length_of_the_delta_encoding);
+ // append() will be called many times on the output string; make sure
+ // the output string is resized only once at most.
+ out->ReserveAdditionalBytes(delta_window_size);
+
+ // Add first element: Win_Indicator
+ if (add_checksum_) {
+ out->push_back(VCD_SOURCE | VCD_CHECKSUM);
+ } else {
+ out->push_back(VCD_SOURCE);
+ }
+ // Source segment size: dictionary size
+ AppendSizeToOutputString(dictionary_size_, out);
+ // Source segment position: 0 (start of dictionary)
+ AppendSizeToOutputString(0, out);
+
+ // [Here is where a secondary compressor would be used
+ // if the encoder and decoder supported that feature.]
+
+ AppendSizeToOutputString(length_of_the_delta_encoding, out);
+ // Start of Delta Encoding
+ const size_t size_before_delta_encoding = out->size();
+ AppendSizeToOutputString(target_length_, out);
+ out->push_back(0x00); // Delta_Indicator: no compression
+ AppendSizeToOutputString(separate_data_for_add_and_run_.size(), out);
+ AppendSizeToOutputString(instructions_and_sizes_.size(), out);
+ AppendSizeToOutputString(separate_addresses_for_copy_.size(), out);
+ if (add_checksum_) {
+ // The checksum is a 32-bit *unsigned* integer. VarintBE requires a
+ // signed type, so use a 64-bit signed integer to store the checksum.
+ VarintBE<int64_t>::AppendToOutputString(static_cast<int64_t>(checksum_),
+ out);
+ }
+ out->append(separate_data_for_add_and_run_.data(),
+ separate_data_for_add_and_run_.size());
+ out->append(instructions_and_sizes_.data(),
+ instructions_and_sizes_.size());
+ out->append(separate_addresses_for_copy_.data(),
+ separate_addresses_for_copy_.size());
+ // End of Delta Encoding
+ const size_t size_after_delta_encoding = out->size();
+ if (length_of_the_delta_encoding !=
+ (size_after_delta_encoding - size_before_delta_encoding)) {
+ VCD_DFATAL << "Internal error: calculated length of the delta encoding ("
+ << length_of_the_delta_encoding
+ << ") does not match actual length ("
+ << (size_after_delta_encoding - size_before_delta_encoding)
+ << VCD_ENDL;
+ }
+ separate_data_for_add_and_run_.clear();
+ instructions_and_sizes_.clear();
+ separate_addresses_for_copy_.clear();
+ if (target_length_ == 0) {
+ VCD_WARNING << "Empty target window" << VCD_ENDL;
+ }
+ }
+
+ // Reset state for next window; assume we are using same code table
+ // and dictionary. The caller will have to invoke Init() if a different
+ // dictionary is used.
+ //
+ // Notably, Init() calls address_cache_.Init(). This resets the address
+ // cache between delta windows, as required by RFC section 5.1.
+ if (!Init(dictionary_size_)) {
+ VCD_DFATAL << "Internal error: calling Init() to reset "
+ "VCDiffCodeTableWriter state failed" << VCD_ENDL;
+ }
+}
+
+}; // namespace open_vcdiff