aboutsummaryrefslogtreecommitdiff
path: root/src/include/fst/extensions/far/sttable.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/include/fst/extensions/far/sttable.h')
-rw-r--r--src/include/fst/extensions/far/sttable.h370
1 files changed, 370 insertions, 0 deletions
diff --git a/src/include/fst/extensions/far/sttable.h b/src/include/fst/extensions/far/sttable.h
new file mode 100644
index 0000000..3a03133
--- /dev/null
+++ b/src/include/fst/extensions/far/sttable.h
@@ -0,0 +1,370 @@
+// sttable.h
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Copyright 2005-2010 Google, Inc.
+// Author: allauzen@google.com (Cyril Allauzen)
+//
+// \file
+// A generic string-to-type table file format
+//
+// This is not meant as a generalization of SSTable. This is more of
+// a simple replacement for SSTable in order to provide an open-source
+// implementation of the FAR format for the external version of the
+// FST Library.
+
+#ifndef FST_EXTENSIONS_FAR_STTABLE_H_
+#define FST_EXTENSIONS_FAR_STTABLE_H_
+
+#include <algorithm>
+#include <iostream>
+#include <fstream>
+#include <fst/util.h>
+
+namespace fst {
+
+static const int32 kSTTableMagicNumber = 2125656924;
+static const int32 kSTTableFileVersion = 1;
+
+// String-to-type table writing class for object of type 'T' using functor 'W'
+// to write an object of type 'T' from a stream. 'W' must conform to the
+// following interface:
+//
+// struct Writer {
+// void operator()(ostream &, const T &) const;
+// };
+//
+template <class T, class W>
+class STTableWriter {
+ public:
+ typedef T EntryType;
+ typedef W EntryWriter;
+
+ explicit STTableWriter(const string &filename)
+ : stream_(filename.c_str(), ofstream::out | ofstream::binary),
+ error_(false) {
+ WriteType(stream_, kSTTableMagicNumber);
+ WriteType(stream_, kSTTableFileVersion);
+ if (!stream_) {
+ FSTERROR() << "STTableWriter::STTableWriter: error writing to file: "
+ << filename;
+ error_=true;
+ }
+ }
+
+ static STTableWriter<T, W> *Create(const string &filename) {
+ if (filename.empty()) {
+ LOG(ERROR) << "STTableWriter: writing to standard out unsupported.";
+ return 0;
+ }
+ return new STTableWriter<T, W>(filename);
+ }
+
+ void Add(const string &key, const T &t) {
+ if (key == "") {
+ FSTERROR() << "STTableWriter::Add: key empty: " << key;
+ error_ = true;
+ } else if (key < last_key_) {
+ FSTERROR() << "STTableWriter::Add: key disorder: " << key;
+ error_ = true;
+ }
+ if (error_) return;
+ last_key_ = key;
+ positions_.push_back(stream_.tellp());
+ WriteType(stream_, key);
+ entry_writer_(stream_, t);
+ }
+
+ bool Error() const { return error_; }
+
+ ~STTableWriter() {
+ WriteType(stream_, positions_);
+ WriteType(stream_, static_cast<int64>(positions_.size()));
+ }
+
+ private:
+ EntryWriter entry_writer_; // Write functor for 'EntryType'
+ ofstream stream_; // Output stream
+ vector<int64> positions_; // Position in file of each key-entry pair
+ string last_key_; // Last key
+ bool error_;
+
+ DISALLOW_COPY_AND_ASSIGN(STTableWriter);
+};
+
+
+// String-to-type table reading class for object of type 'T' using functor 'R'
+// to read an object of type 'T' form a stream. 'R' must conform to the
+// following interface:
+//
+// struct Reader {
+// T *operator()(istream &) const;
+// };
+//
+template <class T, class R>
+class STTableReader {
+ public:
+ typedef T EntryType;
+ typedef R EntryReader;
+
+ explicit STTableReader(const vector<string> &filenames)
+ : sources_(filenames), entry_(0), error_(false) {
+ compare_ = new Compare(&keys_);
+ keys_.resize(filenames.size());
+ streams_.resize(filenames.size(), 0);
+ positions_.resize(filenames.size());
+ for (size_t i = 0; i < filenames.size(); ++i) {
+ streams_[i] = new ifstream(
+ filenames[i].c_str(), ifstream::in | ifstream::binary);
+ int32 magic_number = 0, file_version = 0;
+ ReadType(*streams_[i], &magic_number);
+ ReadType(*streams_[i], &file_version);
+ if (magic_number != kSTTableMagicNumber) {
+ FSTERROR() << "STTableReader::STTableReader: wrong file type: "
+ << filenames[i];
+ error_ = true;
+ return;
+ }
+ if (file_version != kSTTableFileVersion) {
+ FSTERROR() << "STTableReader::STTableReader: wrong file version: "
+ << filenames[i];
+ error_ = true;
+ return;
+ }
+ int64 num_entries;
+ streams_[i]->seekg(-static_cast<int>(sizeof(int64)), ios_base::end);
+ ReadType(*streams_[i], &num_entries);
+ streams_[i]->seekg(-static_cast<int>(sizeof(int64)) *
+ (num_entries + 1), ios_base::end);
+ positions_[i].resize(num_entries);
+ for (size_t j = 0; (j < num_entries) && (*streams_[i]); ++j)
+ ReadType(*streams_[i], &(positions_[i][j]));
+ streams_[i]->seekg(positions_[i][0]);
+ if (!*streams_[i]) {
+ FSTERROR() << "STTableReader::STTableReader: error reading file: "
+ << filenames[i];
+ error_ = true;
+ return;
+ }
+
+ }
+ MakeHeap();
+ }
+
+ ~STTableReader() {
+ for (size_t i = 0; i < streams_.size(); ++i)
+ delete streams_[i];
+ delete compare_;
+ if (entry_)
+ delete entry_;
+ }
+
+ static STTableReader<T, R> *Open(const string &filename) {
+ if (filename.empty()) {
+ LOG(ERROR) << "STTableReader: reading from standard in not supported";
+ return 0;
+ }
+ vector<string> filenames;
+ filenames.push_back(filename);
+ return new STTableReader<T, R>(filenames);
+ }
+
+ static STTableReader<T, R> *Open(const vector<string> &filenames) {
+ return new STTableReader<T, R>(filenames);
+ }
+
+ void Reset() {
+ if (error_) return;
+ for (size_t i = 0; i < streams_.size(); ++i)
+ streams_[i]->seekg(positions_[i].front());
+ MakeHeap();
+ }
+
+ bool Find(const string &key) {
+ if (error_) return false;
+ for (size_t i = 0; i < streams_.size(); ++i)
+ LowerBound(i, key);
+ MakeHeap();
+ return keys_[current_] == key;
+ }
+
+ bool Done() const { return error_ || heap_.empty(); }
+
+ void Next() {
+ if (error_) return;
+ if (streams_[current_]->tellg() <= positions_[current_].back()) {
+ ReadType(*(streams_[current_]), &(keys_[current_]));
+ if (!*streams_[current_]) {
+ FSTERROR() << "STTableReader: error reading file: "
+ << sources_[current_];
+ error_ = true;
+ return;
+ }
+ push_heap(heap_.begin(), heap_.end(), *compare_);
+ } else {
+ heap_.pop_back();
+ }
+ if (!heap_.empty())
+ PopHeap();
+ }
+
+ const string &GetKey() const {
+ return keys_[current_];
+ }
+
+ const EntryType &GetEntry() const {
+ return *entry_;
+ }
+
+ bool Error() const { return error_; }
+
+ private:
+ // Comparison functor used to compare stream IDs in the heap
+ struct Compare {
+ Compare(const vector<string> *keys) : keys_(keys) {}
+
+ bool operator()(size_t i, size_t j) const {
+ return (*keys_)[i] > (*keys_)[j];
+ };
+
+ private:
+ const vector<string> *keys_;
+ };
+
+ // Position the stream with ID 'id' at the position corresponding
+ // to the lower bound for key 'find_key'
+ void LowerBound(size_t id, const string &find_key) {
+ ifstream *strm = streams_[id];
+ const vector<int64> &positions = positions_[id];
+ size_t low = 0, high = positions.size() - 1;
+
+ while (low < high) {
+ size_t mid = (low + high)/2;
+ strm->seekg(positions[mid]);
+ string key;
+ ReadType(*strm, &key);
+ if (key > find_key) {
+ high = mid;
+ } else if (key < find_key) {
+ low = mid + 1;
+ } else {
+ for (size_t i = mid; i > low; --i) {
+ strm->seekg(positions[i - 1]);
+ ReadType(*strm, &key);
+ if (key != find_key) {
+ strm->seekg(positions[i]);
+ return;
+ }
+ }
+ strm->seekg(positions[low]);
+ return;
+ }
+ }
+ strm->seekg(positions[low]);
+ }
+
+ // Add all streams to the heap
+ void MakeHeap() {
+ heap_.clear();
+ for (size_t i = 0; i < streams_.size(); ++i) {
+ ReadType(*streams_[i], &(keys_[i]));
+ if (!*streams_[i]) {
+ FSTERROR() << "STTableReader: error reading file: " << sources_[i];
+ error_ = true;
+ return;
+ }
+ heap_.push_back(i);
+ }
+ make_heap(heap_.begin(), heap_.end(), *compare_);
+ PopHeap();
+ }
+
+ // Position the stream with the lowest key at the top
+ // of the heap, set 'current_' to the ID of that stream
+ // and read the current entry from that stream
+ void PopHeap() {
+ pop_heap(heap_.begin(), heap_.end(), *compare_);
+ current_ = heap_.back();
+ if (entry_)
+ delete entry_;
+ entry_ = entry_reader_(*streams_[current_]);
+ if (!entry_)
+ error_ = true;
+ if (!*streams_[current_]) {
+ FSTERROR() << "STTableReader: error reading entry for key: "
+ << keys_[current_] << ", file: " << sources_[current_];
+ error_ = true;
+ }
+ }
+
+
+ EntryReader entry_reader_; // Read functor for 'EntryType'
+ vector<ifstream*> streams_; // Input streams
+ vector<string> sources_; // and corresponding file names
+ vector<vector<int64> > positions_; // Index of positions for each stream
+ vector<string> keys_; // Lowest unread key for each stream
+ vector<int64> heap_; // Heap containing ID of streams with unread keys
+ int64 current_; // Id of current stream to be read
+ Compare *compare_; // Functor comparing stream IDs for the heap
+ mutable EntryType *entry_; // Pointer to the currently read entry
+ bool error_;
+
+ DISALLOW_COPY_AND_ASSIGN(STTableReader);
+};
+
+
+// String-to-type table header reading function template on the entry header
+// type 'H' having a member function:
+// Read(istream &strm, const string &filename);
+// Checks that 'filename' is an STTable and call the H::Read() on the last
+// entry in the STTable.
+template <class H>
+bool ReadSTTableHeader(const string &filename, H *header) {
+ ifstream strm(filename.c_str(), ifstream::in | ifstream::binary);
+ int32 magic_number = 0, file_version = 0;
+ ReadType(strm, &magic_number);
+ ReadType(strm, &file_version);
+ if (magic_number != kSTTableMagicNumber) {
+ LOG(ERROR) << "ReadSTTableHeader: wrong file type: " << filename;
+ return false;
+ }
+ if (file_version != kSTTableFileVersion) {
+ LOG(ERROR) << "ReadSTTableHeader: wrong file version: " << filename;
+ return false;
+ }
+ int64 i = -1;
+ strm.seekg(-static_cast<int>(sizeof(int64)), ios_base::end);
+ ReadType(strm, &i); // Read number of entries
+ if (!strm) {
+ LOG(ERROR) << "ReadSTTableHeader: error reading file: " << filename;
+ return false;
+ }
+ if (i == 0) return true; // No entry header to read
+ strm.seekg(-2 * static_cast<int>(sizeof(int64)), ios_base::end);
+ ReadType(strm, &i); // Read position for last entry in file
+ strm.seekg(i);
+ string key;
+ ReadType(strm, &key);
+ header->Read(strm, filename + ":" + key);
+ if (!strm) {
+ LOG(ERROR) << "ReadSTTableHeader: error reading file: " << filename;
+ return false;
+ }
+ return true;
+}
+
+bool IsSTTable(const string &filename);
+
+} // namespace fst
+
+#endif // FST_EXTENSIONS_FAR_STTABLE_H_