src/google/vcencoder.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262

// Copyright 2007 Google Inc.
// Author: Lincoln Smith
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef OPEN_VCDIFF_VCENCODER_H_
#define OPEN_VCDIFF_VCENCODER_H_

#include <stddef.h>  // size_t
#include "google/format_extension_flags.h"
#include "google/output_string.h"

namespace open_vcdiff {

class VCDiffEngine;
class VCDiffStreamingEncoderImpl;

// A HashedDictionary must be constructed from the dictionary data
// in order to use VCDiffStreamingEncoder.  If the same dictionary will
// be used to perform several encoding operations, then the caller should
// create the HashedDictionary once and cache it for reuse.  This object
// is thread-safe: the same const HashedDictionary can be used
// by several threads simultaneously, each with its own VCDiffStreamingEncoder.
//
// dictionary_contents is copied into the HashedDictionary, so the
// caller may free that string, if desired, after the constructor returns.
//
class HashedDictionary {
 public:
  HashedDictionary(const char* dictionary_contents,
                   size_t dictionary_size);
  ~HashedDictionary();

  // Init() must be called before using the HashedDictionary as an argument
  // to the VCDiffStreamingEncoder, or for any other purpose except
  // destruction.  It returns true if initialization succeeded, or false
  // if an error occurred, in which case the caller should destroy the object
  // without using it.
  bool Init();

  const VCDiffEngine* engine() const { return engine_; }

 private:
  const VCDiffEngine* engine_;

  // Make the copy constructor and assignment operator private
  // so that they don't inadvertently get used.
  HashedDictionary(const HashedDictionary&);  // NOLINT
  void operator=(const HashedDictionary&);
};

// The standard streaming interface to the VCDIFF (RFC 3284) encoder.
// "Streaming" in this context means that, even though the entire set of
// input data to be encoded may not be available at once, the encoder
// can produce partial output based on what is available.  Of course,
// the caller should try to maximize the sizes of the data chunks passed
// to the encoder.
class VCDiffStreamingEncoder {
 public:
  // The HashedDictionary object passed to the constructor must remain valid,
  // without being deleted, for the lifetime of the VCDiffStreamingEncoder
  // object.
  //
  // format_extensions allows certain open-vcdiff extensions to the VCDIFF
  // format to be included in the encoded output.  These extensions are not
  // part of the RFC 3284 draft standard, so specifying any extension flags
  // will make the output compatible only with open-vcdiff, or with other
  // VCDIFF implementations that accept these extensions.  See above for an
  // explanation of each possible flag value.
  //
  // *** look_for_target_matches:
  // The VCDIFF format allows COPY instruction addresses to reference data from
  // the source (dictionary), or from previously encoded target data.
  //
  // If look_for_target_matches is false, then the encoder will only
  // produce COPY instructions that reference source data from the dictionary,
  // never from previously encoded target data.  This will speed up the encoding
  // process, but the encoded data will not be as compact.
  //
  // If this value is true, then the encoder will produce COPY instructions
  // that reference either source data or target data.  A COPY instruction from
  // the previously encoded target data may even extend into the range of the
  // data being produced by that same COPY instruction; for example, if the
  // previously encoded target data is "LA", then a single COPY instruction of
  // length 10 can produce the additional target data "LALALALALA".
  //
  // There is a third type of COPY instruction that starts within
  // the source data and extends from the end of the source data
  // into the beginning of the target data.  This VCDIFF encoder will never
  // produce a COPY instruction of this third type (regardless of the value of
  // look_for_target_matches) because the cost of checking for matches
  // across the source-target boundary would not justify its benefits.
  //
  VCDiffStreamingEncoder(const HashedDictionary* dictionary,
                         VCDiffFormatExtensionFlags format_extensions,
                         bool look_for_target_matches);
  ~VCDiffStreamingEncoder();

  // The client should use these routines as follows:
  //    HashedDictionary hd(dictionary, dictionary_size);
  //    if (!hd.Init()) {
  //      HandleError();
  //      return;
  //    }
  //    string output_string;
  //    VCDiffStreamingEncoder v(hd, false, false);
  //    if (!v.StartEncoding(&output_string)) {
  //      HandleError();
  //      return;  // No need to call FinishEncoding()
  //    }
  //    Process(output_string.data(), output_string.size());
  //    output_string.clear();
  //    while (get data_buf) {
  //      if (!v.EncodeChunk(data_buf, data_len, &output_string)) {
  //        HandleError();
  //        return;  // No need to call FinishEncoding()
  //      }
  //      // The encoding is appended to output_string at each call,
  //      // so clear output_string once its contents have been processed.
  //      Process(output_string.data(), output_string.size());
  //      output_string.clear();
  //    }
  //    if (!v.FinishEncoding(&output_string)) {
  //      HandleError();
  //      return;
  //    }
  //    Process(output_string.data(), output_string.size());
  //    output_string.clear();
  //
  // I.e., the allowed pattern of calls is
  //    StartEncoding EncodeChunk* FinishEncoding
  //
  // The size of the encoded output depends on the sizes of the chunks
  // passed in (i.e. the chunking boundary affects compression).
  // However the decoded output is independent of chunk boundaries.

  // Sets up the data structures for encoding.
  // Writes a VCDIFF delta file header (as defined in RFC section 4.1)
  // to *output_string.
  //
  // Note: we *append*, so the old contents of *output_string stick around.
  // This convention differs from the non-streaming Encode/Decode
  // interfaces in VCDiffEncoder.
  //
  // If an error occurs, this function returns false; otherwise it returns true.
  // If this function returns false, the caller does not need to call
  // FinishEncoding or to do any cleanup except destroying the
  // VCDiffStreamingEncoder object.
  template<class OutputType>
  bool StartEncoding(OutputType* output) {
    OutputString<OutputType> output_string(output);
    return StartEncodingToInterface(&output_string);
  }

  bool StartEncodingToInterface(OutputStringInterface* output_string);

  // Appends compressed encoding for "data" (one complete VCDIFF delta window)
  // to *output_string.
  // If an error occurs (for example, if StartEncoding was not called
  // earlier or StartEncoding returned false), this function returns false;
  // otherwise it returns true.  The caller does not need to call FinishEncoding
  // or do any cleanup except destroying the VCDiffStreamingEncoder
  // if this function returns false.
  template<class OutputType>
  bool EncodeChunk(const char* data, size_t len, OutputType* output) {
    OutputString<OutputType> output_string(output);
    return EncodeChunkToInterface(data, len, &output_string);
  }

  bool EncodeChunkToInterface(const char* data, size_t len,
                              OutputStringInterface* output_string);

  // Finishes encoding and appends any leftover encoded data to *output_string.
  // If an error occurs (for example, if StartEncoding was not called
  // earlier or StartEncoding returned false), this function returns false;
  // otherwise it returns true.  The caller does not need to
  // do any cleanup except destroying the VCDiffStreamingEncoder
  // if this function returns false.
  template<class OutputType>
  bool FinishEncoding(OutputType* output) {
    OutputString<OutputType> output_string(output);
    return FinishEncodingToInterface(&output_string);
  }

  bool FinishEncodingToInterface(OutputStringInterface* output_string);

 private:
  VCDiffStreamingEncoderImpl* const impl_;

  // Make the copy constructor and assignment operator private
  // so that they don't inadvertently get used.
  VCDiffStreamingEncoder(const VCDiffStreamingEncoder&);  // NOLINT
  void operator=(const VCDiffStreamingEncoder&);
};

// A simpler (non-streaming) interface to the VCDIFF encoder that can be used
// if the entire target data string is available.
//
class VCDiffEncoder {
 public:
  VCDiffEncoder(const char* dictionary_contents, size_t dictionary_size)
      : dictionary_(dictionary_contents, dictionary_size),
        encoder_(NULL),
        flags_(VCD_STANDARD_FORMAT),
        look_for_target_matches_(true) { }

  ~VCDiffEncoder() {
    delete encoder_;
  }

  // By default, VCDiffEncoder uses standard VCDIFF format.  This function
  // can be used before calling Encode(), to specify that interleaved format
  // and/or checksum format should be used.
  void SetFormatFlags(VCDiffFormatExtensionFlags flags) { flags_ = flags; }

  // By default, VCDiffEncoder looks for matches in the dictionary and also in
  // the previously encoded target data.  This function can be used before
  // calling Encode(), to specify whether or not target matching should be
  // enabled.
  void SetTargetMatching(bool look_for_target_matches) {
    look_for_target_matches_ = look_for_target_matches;
  }

  // Replaces old contents of output_string with the encoded form of
  // target_data.
  template<class OutputType>
  bool Encode(const char* target_data,
              size_t target_len,
              OutputType* output) {
    OutputString<OutputType> output_string(output);
    return EncodeToInterface(target_data, target_len, &output_string);
  }

 private:
  bool EncodeToInterface(const char* target_data,
                         size_t target_len,
                         OutputStringInterface* output_string);

  HashedDictionary dictionary_;
  VCDiffStreamingEncoder* encoder_;
  VCDiffFormatExtensionFlags flags_;
  bool look_for_target_matches_;

  // Make the copy constructor and assignment operator private
  // so that they don't inadvertently get used.
  VCDiffEncoder(const VCDiffEncoder&);  // NOLINT
  void operator=(const VCDiffEncoder&);
};

}  // namespace open_vcdiff

#endif  // OPEN_VCDIFF_VCENCODER_H_