diff options
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | AUTHORS | 1 | ||||
-rw-r--r-- | Makefile | 14 | ||||
-rw-r--r-- | db/autocompact_test.cc | 118 | ||||
-rw-r--r-- | db/corruption_test.cc | 51 | ||||
-rw-r--r-- | db/db_impl.cc | 41 | ||||
-rw-r--r-- | db/db_impl.h | 9 | ||||
-rw-r--r-- | db/db_iter.cc | 50 | ||||
-rw-r--r-- | db/db_iter.h | 8 | ||||
-rw-r--r-- | db/db_test.cc | 36 | ||||
-rw-r--r-- | db/dbformat.h | 3 | ||||
-rw-r--r-- | db/filename.cc | 9 | ||||
-rw-r--r-- | db/filename.h | 5 | ||||
-rw-r--r-- | db/filename_test.cc | 1 | ||||
-rw-r--r-- | db/repair.cc | 8 | ||||
-rw-r--r-- | db/table_cache.cc | 6 | ||||
-rw-r--r-- | db/version_set.cc | 96 | ||||
-rw-r--r-- | db/version_set.h | 15 | ||||
-rw-r--r-- | doc/impl.html | 2 | ||||
-rw-r--r-- | include/leveldb/db.h | 2 | ||||
-rw-r--r-- | issues/issue200_test.cc | 59 | ||||
-rw-r--r-- | util/arena.cc | 2 | ||||
-rw-r--r-- | util/env_posix.cc | 33 | ||||
-rw-r--r-- | util/random.h | 7 |
24 files changed, 499 insertions, 78 deletions
@@ -6,3 +6,4 @@ build_config.mk *.so.* *_test db_bench +leveldbutil @@ -9,3 +9,4 @@ Sanjay Ghemawat <sanjay@google.com> # Partial list of contributors: Kevin Regan <kevin.d.regan@gmail.com> +Johan Bilien <jobi@litl.com> @@ -31,6 +31,7 @@ TESTHARNESS = ./util/testharness.o $(TESTUTIL) TESTS = \ arena_test \ + autocompact_test \ bloom_test \ c_test \ cache_test \ @@ -43,6 +44,7 @@ TESTS = \ filename_test \ filter_block_test \ issue178_test \ + issue200_test \ log_test \ memenv_test \ skiplist_test \ @@ -70,7 +72,7 @@ SHARED = $(SHARED1) else # Update db.h if you change these. SHARED_MAJOR = 1 -SHARED_MINOR = 12 +SHARED_MINOR = 14 SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT) SHARED2 = $(SHARED1).$(SHARED_MAJOR) SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR) @@ -114,6 +116,9 @@ leveldbutil: db/leveldb_main.o $(LIBOBJECTS) arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) $(LDFLAGS) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +autocompact_test: db/autocompact_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) $(LDFLAGS) db/autocompact_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) + bloom_test: util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) $(LDFLAGS) util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) @@ -150,6 +155,9 @@ filter_block_test: table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) issue178_test: issues/issue178_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) $(LDFLAGS) issues/issue178_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +issue200_test: issues/issue200_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) $(LDFLAGS) issues/issue200_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) + log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) $(LDFLAGS) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) @@ -187,14 +195,14 @@ IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/version CFBu mkdir -p ios-x86/$(dir $@) $(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@ mkdir -p ios-arm/$(dir $@) - $(DEVICEROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@ + xcrun -sdk iphoneos $(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@ lipo ios-x86/$@ ios-arm/$@ -create -output $@ .c.o: mkdir -p ios-x86/$(dir $@) $(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@ mkdir -p ios-arm/$(dir $@) - $(DEVICEROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@ + xcrun -sdk iphoneos $(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@ lipo ios-x86/$@ ios-arm/$@ -create -output $@ else diff --git a/db/autocompact_test.cc b/db/autocompact_test.cc new file mode 100644 index 0000000..d20a236 --- /dev/null +++ b/db/autocompact_test.cc @@ -0,0 +1,118 @@ +// Copyright (c) 2013 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "leveldb/db.h" +#include "db/db_impl.h" +#include "leveldb/cache.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace leveldb { + +class AutoCompactTest { + public: + std::string dbname_; + Cache* tiny_cache_; + Options options_; + DB* db_; + + AutoCompactTest() { + dbname_ = test::TmpDir() + "/autocompact_test"; + tiny_cache_ = NewLRUCache(100); + options_.block_cache = tiny_cache_; + DestroyDB(dbname_, options_); + options_.create_if_missing = true; + options_.compression = kNoCompression; + ASSERT_OK(DB::Open(options_, dbname_, &db_)); + } + + ~AutoCompactTest() { + delete db_; + DestroyDB(dbname_, Options()); + delete tiny_cache_; + } + + std::string Key(int i) { + char buf[100]; + snprintf(buf, sizeof(buf), "key%06d", i); + return std::string(buf); + } + + uint64_t Size(const Slice& start, const Slice& limit) { + Range r(start, limit); + uint64_t size; + db_->GetApproximateSizes(&r, 1, &size); + return size; + } + + void DoReads(int n); +}; + +static const int kValueSize = 200 * 1024; +static const int kTotalSize = 100 * 1024 * 1024; +static const int kCount = kTotalSize / kValueSize; + +// Read through the first n keys repeatedly and check that they get +// compacted (verified by checking the size of the key space). +void AutoCompactTest::DoReads(int n) { + std::string value(kValueSize, 'x'); + DBImpl* dbi = reinterpret_cast<DBImpl*>(db_); + + // Fill database + for (int i = 0; i < kCount; i++) { + ASSERT_OK(db_->Put(WriteOptions(), Key(i), value)); + } + ASSERT_OK(dbi->TEST_CompactMemTable()); + + // Delete everything + for (int i = 0; i < kCount; i++) { + ASSERT_OK(db_->Delete(WriteOptions(), Key(i))); + } + ASSERT_OK(dbi->TEST_CompactMemTable()); + + // Get initial measurement of the space we will be reading. + const int64_t initial_size = Size(Key(0), Key(n)); + const int64_t initial_other_size = Size(Key(n), Key(kCount)); + + // Read until size drops significantly. + std::string limit_key = Key(n); + for (int read = 0; true; read++) { + ASSERT_LT(read, 100) << "Taking too long to compact"; + Iterator* iter = db_->NewIterator(ReadOptions()); + for (iter->SeekToFirst(); + iter->Valid() && iter->key().ToString() < limit_key; + iter->Next()) { + // Drop data + } + delete iter; + // Wait a little bit to allow any triggered compactions to complete. + Env::Default()->SleepForMicroseconds(1000000); + uint64_t size = Size(Key(0), Key(n)); + fprintf(stderr, "iter %3d => %7.3f MB [other %7.3f MB]\n", + read+1, size/1048576.0, Size(Key(n), Key(kCount))/1048576.0); + if (size <= initial_size/10) { + break; + } + } + + // Verify that the size of the key space not touched by the reads + // is pretty much unchanged. + const int64_t final_other_size = Size(Key(n), Key(kCount)); + ASSERT_LE(final_other_size, initial_other_size + 1048576); + ASSERT_GE(final_other_size, initial_other_size/5 - 1048576); +} + +TEST(AutoCompactTest, ReadAll) { + DoReads(kCount); +} + +TEST(AutoCompactTest, ReadHalf) { + DoReads(kCount/2); +} + +} // namespace leveldb + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 31b2d5f..b37ffdf 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -35,6 +35,7 @@ class CorruptionTest { CorruptionTest() { tiny_cache_ = NewLRUCache(100); options_.env = &env_; + options_.block_cache = tiny_cache_; dbname_ = test::TmpDir() + "/db_test"; DestroyDB(dbname_, options_); @@ -50,17 +51,14 @@ class CorruptionTest { delete tiny_cache_; } - Status TryReopen(Options* options = NULL) { + Status TryReopen() { delete db_; db_ = NULL; - Options opt = (options ? *options : options_); - opt.env = &env_; - opt.block_cache = tiny_cache_; - return DB::Open(opt, dbname_, &db_); + return DB::Open(options_, dbname_, &db_); } - void Reopen(Options* options = NULL) { - ASSERT_OK(TryReopen(options)); + void Reopen() { + ASSERT_OK(TryReopen()); } void RepairDB() { @@ -92,6 +90,10 @@ class CorruptionTest { for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { uint64_t key; Slice in(iter->key()); + if (in == "" || in == "~") { + // Ignore boundary keys. + continue; + } if (!ConsumeDecimalNumber(&in, &key) || !in.empty() || key < next_expected) { @@ -233,7 +235,7 @@ TEST(CorruptionTest, TableFile) { dbi->TEST_CompactRange(1, NULL, NULL); Corrupt(kTableFile, 100, 1); - Check(99, 99); + Check(90, 99); } TEST(CorruptionTest, TableFileIndexData) { @@ -299,7 +301,7 @@ TEST(CorruptionTest, CompactionInputError) { ASSERT_EQ(1, Property("leveldb.num-files-at-level" + NumberToString(last))); Corrupt(kTableFile, 100, 1); - Check(9, 9); + Check(5, 9); // Force compactions by writing lots of values Build(10000); @@ -307,32 +309,23 @@ TEST(CorruptionTest, CompactionInputError) { } TEST(CorruptionTest, CompactionInputErrorParanoid) { - Options options; - options.paranoid_checks = true; - options.write_buffer_size = 1048576; - Reopen(&options); + options_.paranoid_checks = true; + options_.write_buffer_size = 512 << 10; + Reopen(); DBImpl* dbi = reinterpret_cast<DBImpl*>(db_); - // Fill levels >= 1 so memtable compaction outputs to level 1 - for (int level = 1; level < config::kNumLevels; level++) { - dbi->Put(WriteOptions(), "", "begin"); - dbi->Put(WriteOptions(), "~", "end"); + // Make multiple inputs so we need to compact. + for (int i = 0; i < 2; i++) { + Build(10); dbi->TEST_CompactMemTable(); + Corrupt(kTableFile, 100, 1); + env_.SleepForMicroseconds(100000); } + dbi->CompactRange(NULL, NULL); - Build(10); - dbi->TEST_CompactMemTable(); - ASSERT_EQ(1, Property("leveldb.num-files-at-level0")); - - Corrupt(kTableFile, 100, 1); - Check(9, 9); - - // Write must eventually fail because of corrupted table - Status s; + // Write must fail because of corrupted table std::string tmp1, tmp2; - for (int i = 0; i < 10000 && s.ok(); i++) { - s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2)); - } + Status s = db_->Put(WriteOptions(), Key(5, &tmp1), Value(5, &tmp2)); ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db"; } diff --git a/db/db_impl.cc b/db/db_impl.cc index 395d317..fa13510 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -113,14 +113,14 @@ Options SanitizeOptions(const std::string& dbname, return result; } -DBImpl::DBImpl(const Options& options, const std::string& dbname) - : env_(options.env), - internal_comparator_(options.comparator), - internal_filter_policy_(options.filter_policy), - options_(SanitizeOptions( - dbname, &internal_comparator_, &internal_filter_policy_, options)), - owns_info_log_(options_.info_log != options.info_log), - owns_cache_(options_.block_cache != options.block_cache), +DBImpl::DBImpl(const Options& raw_options, const std::string& dbname) + : env_(raw_options.env), + internal_comparator_(raw_options.comparator), + internal_filter_policy_(raw_options.filter_policy), + options_(SanitizeOptions(dbname, &internal_comparator_, + &internal_filter_policy_, raw_options)), + owns_info_log_(options_.info_log != raw_options.info_log), + owns_cache_(options_.block_cache != raw_options.block_cache), dbname_(dbname), db_lock_(NULL), shutting_down_(NULL), @@ -130,6 +130,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) logfile_(NULL), logfile_number_(0), log_(NULL), + seed_(0), tmp_batch_(new WriteBatch), bg_compaction_scheduled_(false), manual_compaction_(NULL), @@ -138,7 +139,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) has_imm_.Release_Store(NULL); // Reserve ten files or so for other uses and give the rest to TableCache. - const int table_cache_size = options.max_open_files - kNumNonTableCacheFiles; + const int table_cache_size = options_.max_open_files - kNumNonTableCacheFiles; table_cache_ = new TableCache(dbname_, &options_, table_cache_size); versions_ = new VersionSet(dbname_, &options_, table_cache_, @@ -1027,7 +1028,8 @@ static void CleanupIteratorState(void* arg1, void* arg2) { } // namespace Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, - SequenceNumber* latest_snapshot) { + SequenceNumber* latest_snapshot, + uint32_t* seed) { IterState* cleanup = new IterState; mutex_.Lock(); *latest_snapshot = versions_->LastSequence(); @@ -1051,13 +1053,15 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, cleanup->version = versions_->current(); internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, NULL); + *seed = ++seed_; mutex_.Unlock(); return internal_iter; } Iterator* DBImpl::TEST_NewInternalIterator() { SequenceNumber ignored; - return NewInternalIterator(ReadOptions(), &ignored); + uint32_t ignored_seed; + return NewInternalIterator(ReadOptions(), &ignored, &ignored_seed); } int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() { @@ -1114,12 +1118,21 @@ Status DBImpl::Get(const ReadOptions& options, Iterator* DBImpl::NewIterator(const ReadOptions& options) { SequenceNumber latest_snapshot; - Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot); + uint32_t seed; + Iterator* iter = NewInternalIterator(options, &latest_snapshot, &seed); return NewDBIterator( - &dbname_, env_, user_comparator(), internal_iter, + this, user_comparator(), iter, (options.snapshot != NULL ? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_ - : latest_snapshot)); + : latest_snapshot), + seed); +} + +void DBImpl::RecordReadSample(Slice key) { + MutexLock l(&mutex_); + if (versions_->current()->RecordReadSample(key)) { + MaybeScheduleCompaction(); + } } const Snapshot* DBImpl::GetSnapshot() { diff --git a/db/db_impl.h b/db/db_impl.h index 3c8d711..75fd30a 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -59,13 +59,19 @@ class DBImpl : public DB { // file at a level >= 1. int64_t TEST_MaxNextLevelOverlappingBytes(); + // Record a sample of bytes read at the specified internal key. + // Samples are taken approximately once every config::kReadBytesPeriod + // bytes. + void RecordReadSample(Slice key); + private: friend class DB; struct CompactionState; struct Writer; Iterator* NewInternalIterator(const ReadOptions&, - SequenceNumber* latest_snapshot); + SequenceNumber* latest_snapshot, + uint32_t* seed); Status NewDB(); @@ -135,6 +141,7 @@ class DBImpl : public DB { WritableFile* logfile_; uint64_t logfile_number_; log::Writer* log_; + uint32_t seed_; // For sampling. // Queue of writers. std::deque<Writer*> writers_; diff --git a/db/db_iter.cc b/db/db_iter.cc index 87dca2d..3b2035e 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -5,12 +5,14 @@ #include "db/db_iter.h" #include "db/filename.h" +#include "db/db_impl.h" #include "db/dbformat.h" #include "leveldb/env.h" #include "leveldb/iterator.h" #include "port/port.h" #include "util/logging.h" #include "util/mutexlock.h" +#include "util/random.h" namespace leveldb { @@ -46,15 +48,16 @@ class DBIter: public Iterator { kReverse }; - DBIter(const std::string* dbname, Env* env, - const Comparator* cmp, Iterator* iter, SequenceNumber s) - : dbname_(dbname), - env_(env), + DBIter(DBImpl* db, const Comparator* cmp, Iterator* iter, SequenceNumber s, + uint32_t seed) + : db_(db), user_comparator_(cmp), iter_(iter), sequence_(s), direction_(kForward), - valid_(false) { + valid_(false), + rnd_(seed), + bytes_counter_(RandomPeriod()) { } virtual ~DBIter() { delete iter_; @@ -100,8 +103,12 @@ class DBIter: public Iterator { } } - const std::string* const dbname_; - Env* const env_; + // Pick next gap with average value of config::kReadBytesPeriod. + ssize_t RandomPeriod() { + return rnd_.Uniform(2*config::kReadBytesPeriod); + } + + DBImpl* db_; const Comparator* const user_comparator_; Iterator* const iter_; SequenceNumber const sequence_; @@ -112,13 +119,23 @@ class DBIter: public Iterator { Direction direction_; bool valid_; + Random rnd_; + ssize_t bytes_counter_; + // No copying allowed DBIter(const DBIter&); void operator=(const DBIter&); }; inline bool DBIter::ParseKey(ParsedInternalKey* ikey) { - if (!ParseInternalKey(iter_->key(), ikey)) { + Slice k = iter_->key(); + ssize_t n = k.size() + iter_->value().size(); + bytes_counter_ -= n; + while (bytes_counter_ < 0) { + bytes_counter_ += RandomPeriod(); + db_->RecordReadSample(k); + } + if (!ParseInternalKey(k, ikey)) { status_ = Status::Corruption("corrupted internal key in DBIter"); return false; } else { @@ -144,12 +161,13 @@ void DBIter::Next() { saved_key_.clear(); return; } + // saved_key_ already contains the key to skip past. + } else { + // Store in saved_key_ the current key so we skip it below. + SaveKey(ExtractUserKey(iter_->key()), &saved_key_); } - // Temporarily use saved_key_ as storage for key to skip. - std::string* skip = &saved_key_; - SaveKey(ExtractUserKey(iter_->key()), skip); - FindNextUserEntry(true, skip); + FindNextUserEntry(true, &saved_key_); } void DBIter::FindNextUserEntry(bool skipping, std::string* skip) { @@ -288,12 +306,12 @@ void DBIter::SeekToLast() { } // anonymous namespace Iterator* NewDBIterator( - const std::string* dbname, - Env* env, + DBImpl* db, const Comparator* user_key_comparator, Iterator* internal_iter, - const SequenceNumber& sequence) { - return new DBIter(dbname, env, user_key_comparator, internal_iter, sequence); + SequenceNumber sequence, + uint32_t seed) { + return new DBIter(db, user_key_comparator, internal_iter, sequence, seed); } } // namespace leveldb diff --git a/db/db_iter.h b/db/db_iter.h index d9e1b17..04927e9 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -11,15 +11,17 @@ namespace leveldb { +class DBImpl; + // Return a new iterator that converts internal keys (yielded by // "*internal_iter") that were live at the specified "sequence" number // into appropriate user keys. extern Iterator* NewDBIterator( - const std::string* dbname, - Env* env, + DBImpl* db, const Comparator* user_key_comparator, Iterator* internal_iter, - const SequenceNumber& sequence); + SequenceNumber sequence, + uint32_t seed); } // namespace leveldb diff --git a/db/db_test.cc b/db/db_test.cc index 49aae04..848a038 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -147,7 +147,7 @@ class SpecialEnv : public EnvWrapper { Status s = target()->NewWritableFile(f, r); if (s.ok()) { - if (strstr(f.c_str(), ".sst") != NULL) { + if (strstr(f.c_str(), ".ldb") != NULL) { *r = new SSTableFile(this, *r); } else if (strstr(f.c_str(), "MANIFEST") != NULL) { *r = new ManifestFile(this, *r); @@ -484,6 +484,24 @@ class DBTest { } return false; } + + // Returns number of files renamed. + int RenameLDBToSST() { + std::vector<std::string> filenames; + ASSERT_OK(env_->GetChildren(dbname_, &filenames)); + uint64_t number; + FileType type; + int files_renamed = 0; + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type) && type == kTableFile) { + const std::string from = TableFileName(dbname_, number); + const std::string to = SSTTableFileName(dbname_, number); + ASSERT_OK(env_->RenameFile(from, to)); + files_renamed++; + } + } + return files_renamed; + } }; TEST(DBTest, Empty) { @@ -1632,6 +1650,22 @@ TEST(DBTest, MissingSSTFile) { << s.ToString(); } +TEST(DBTest, StillReadSST) { + ASSERT_OK(Put("foo", "bar")); + ASSERT_EQ("bar", Get("foo")); + + // Dump the memtable to disk. + dbfull()->TEST_CompactMemTable(); + ASSERT_EQ("bar", Get("foo")); + Close(); + ASSERT_GT(RenameLDBToSST(), 0); + Options options = CurrentOptions(); + options.paranoid_checks = true; + Status s = TryReopen(&options); + ASSERT_TRUE(s.ok()); + ASSERT_EQ("bar", Get("foo")); +} + TEST(DBTest, FilesDeletedAfterCompaction) { ASSERT_OK(Put("foo", "v2")); Compact("a", "z"); diff --git a/db/dbformat.h b/db/dbformat.h index f7f64da..5d8a032 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -38,6 +38,9 @@ static const int kL0_StopWritesTrigger = 12; // space if the same key space is being repeatedly overwritten. static const int kMaxMemCompactLevel = 2; +// Approximate gap in bytes between samples of data read during iteration. +static const int kReadBytesPeriod = 1048576; + } // namespace config class InternalKey; diff --git a/db/filename.cc b/db/filename.cc index 3c4d49f..da32946 100644 --- a/db/filename.cc +++ b/db/filename.cc @@ -31,6 +31,11 @@ std::string LogFileName(const std::string& name, uint64_t number) { std::string TableFileName(const std::string& name, uint64_t number) { assert(number > 0); + return MakeFileName(name, number, "ldb"); +} + +std::string SSTTableFileName(const std::string& name, uint64_t number) { + assert(number > 0); return MakeFileName(name, number, "sst"); } @@ -71,7 +76,7 @@ std::string OldInfoLogFileName(const std::string& dbname) { // dbname/LOG // dbname/LOG.old // dbname/MANIFEST-[0-9]+ -// dbname/[0-9]+.(log|sst) +// dbname/[0-9]+.(log|sst|ldb) bool ParseFileName(const std::string& fname, uint64_t* number, FileType* type) { @@ -106,7 +111,7 @@ bool ParseFileName(const std::string& fname, Slice suffix = rest; if (suffix == Slice(".log")) { *type = kLogFile; - } else if (suffix == Slice(".sst")) { + } else if (suffix == Slice(".sst") || suffix == Slice(".ldb")) { *type = kTableFile; } else if (suffix == Slice(".dbtmp")) { *type = kTempFile; diff --git a/db/filename.h b/db/filename.h index d5d09b1..87a7526 100644 --- a/db/filename.h +++ b/db/filename.h @@ -37,6 +37,11 @@ extern std::string LogFileName(const std::string& dbname, uint64_t number); // "dbname". extern std::string TableFileName(const std::string& dbname, uint64_t number); +// Return the legacy file name for an sstable with the specified number +// in the db named by "dbname". The result will be prefixed with +// "dbname". +extern std::string SSTTableFileName(const std::string& dbname, uint64_t number); + // Return the name of the descriptor file for the db named by // "dbname" and the specified incarnation number. The result will be // prefixed with "dbname". diff --git a/db/filename_test.cc b/db/filename_test.cc index 5a26da4..a32556d 100644 --- a/db/filename_test.cc +++ b/db/filename_test.cc @@ -27,6 +27,7 @@ TEST(FileNameTest, Parse) { { "100.log", 100, kLogFile }, { "0.log", 0, kLogFile }, { "0.sst", 0, kTableFile }, + { "0.ldb", 0, kTableFile }, { "CURRENT", 0, kCurrentFile }, { "LOCK", 0, kDBLockFile }, { "MANIFEST-2", 2, kDescriptorFile }, diff --git a/db/repair.cc b/db/repair.cc index 022d52f..dc93fb8 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -263,6 +263,12 @@ class Repairer { std::string fname = TableFileName(dbname_, t->meta.number); int counter = 0; Status status = env_->GetFileSize(fname, &t->meta.file_size); + if (!status.ok()) { + fname = SSTTableFileName(dbname_, t->meta.number); + Status s2 = env_->GetFileSize(fname, &t->meta.file_size); + if (s2.ok()) + status = Status::OK(); + } if (status.ok()) { Iterator* iter = table_cache_->NewIterator( ReadOptions(), t->meta.number, t->meta.file_size); @@ -293,6 +299,8 @@ class Repairer { } delete iter; } + // If there was trouble opening an .sst file this will report that the .ldb + // file was not found, which is kind of lame but shouldn't happen often. Log(options_.info_log, "Table #%llu: %d entries %s", (unsigned long long) t->meta.number, counter, diff --git a/db/table_cache.cc b/db/table_cache.cc index 497db27..e3d82cd 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -54,6 +54,12 @@ Status TableCache::FindTable(uint64_t file_number, uint64_t file_size, RandomAccessFile* file = NULL; Table* table = NULL; s = env_->NewRandomAccessFile(fname, &file); + if (!s.ok()) { + std::string old_fname = SSTTableFileName(dbname_, file_number); + if (env_->NewRandomAccessFile(old_fname, &file).ok()) { + s = Status::OK(); + } + } if (s.ok()) { s = Table::Open(*options_, file, file_size, &table); } diff --git a/db/version_set.cc b/db/version_set.cc index 4fd1dde..66d73be 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -289,6 +289,51 @@ static bool NewestFirst(FileMetaData* a, FileMetaData* b) { return a->number > b->number; } +void Version::ForEachOverlapping(Slice user_key, Slice internal_key, + void* arg, + bool (*func)(void*, int, FileMetaData*)) { + // TODO(sanjay): Change Version::Get() to use this function. + const Comparator* ucmp = vset_->icmp_.user_comparator(); + + // Search level-0 in order from newest to oldest. + std::vector<FileMetaData*> tmp; + tmp.reserve(files_[0].size()); + for (uint32_t i = 0; i < files_[0].size(); i++) { + FileMetaData* f = files_[0][i]; + if (ucmp->Compare(user_key, f->smallest.user_key()) >= 0 && + ucmp->Compare(user_key, f->largest.user_key()) <= 0) { + tmp.push_back(f); + } + } + if (!tmp.empty()) { + std::sort(tmp.begin(), tmp.end(), NewestFirst); + for (uint32_t i = 0; i < tmp.size(); i++) { + if (!(*func)(arg, 0, tmp[i])) { + return; + } + } + } + + // Search other levels. + for (int level = 1; level < config::kNumLevels; level++) { + size_t num_files = files_[level].size(); + if (num_files == 0) continue; + + // Binary search to find earliest index whose largest key >= internal_key. + uint32_t index = FindFile(vset_->icmp_, files_[level], internal_key); + if (index < num_files) { + FileMetaData* f = files_[level][index]; + if (ucmp->Compare(user_key, f->smallest.user_key()) < 0) { + // All of "f" is past any data for user_key + } else { + if (!(*func)(arg, level, f)) { + return; + } + } + } + } +} + Status Version::Get(const ReadOptions& options, const LookupKey& k, std::string* value, @@ -401,6 +446,44 @@ bool Version::UpdateStats(const GetStats& stats) { return false; } +bool Version::RecordReadSample(Slice internal_key) { + ParsedInternalKey ikey; + if (!ParseInternalKey(internal_key, &ikey)) { + return false; + } + + struct State { + GetStats stats; // Holds first matching file + int matches; + + static bool Match(void* arg, int level, FileMetaData* f) { + State* state = reinterpret_cast<State*>(arg); + state->matches++; + if (state->matches == 1) { + // Remember first match. + state->stats.seek_file = f; + state->stats.seek_file_level = level; + } + // We can stop iterating once we have a second match. + return state->matches < 2; + } + }; + + State state; + state.matches = 0; + ForEachOverlapping(ikey.user_key, internal_key, &state, &State::Match); + + // Must have at least two matches since we want to merge across + // files. But what if we have a single file that contains many + // overwrites and deletions? Should we have another mechanism for + // finding such files? + if (state.matches >= 2) { + // 1MB cost is about 1 seek (see comment in Builder::Apply). + return UpdateStats(state.stats); + } + return false; +} + void Version::Ref() { ++refs_; } @@ -435,10 +518,13 @@ int Version::PickLevelForMemTableOutput( if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) { break; } - GetOverlappingInputs(level + 2, &start, &limit, &overlaps); - const int64_t sum = TotalFileSize(overlaps); - if (sum > kMaxGrandParentOverlapBytes) { - break; + if (level + 2 < config::kNumLevels) { + // Check that file does not overlap too many grandparent bytes. + GetOverlappingInputs(level + 2, &start, &limit, &overlaps); + const int64_t sum = TotalFileSize(overlaps); + if (sum > kMaxGrandParentOverlapBytes) { + break; + } } level++; } @@ -452,6 +538,8 @@ void Version::GetOverlappingInputs( const InternalKey* begin, const InternalKey* end, std::vector<FileMetaData*>* inputs) { + assert(level >= 0); + assert(level < config::kNumLevels); inputs->clear(); Slice user_begin, user_end; if (begin != NULL) { diff --git a/db/version_set.h b/db/version_set.h index 9d084fd..20de0e2 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -78,6 +78,12 @@ class Version { // REQUIRES: lock is held bool UpdateStats(const GetStats& stats); + // Record a sample of bytes read at the specified internal key. + // Samples are taken approximately once every config::kReadBytesPeriod + // bytes. Returns true if a new compaction may need to be triggered. + // REQUIRES: lock is held + bool RecordReadSample(Slice key); + // Reference count management (so Versions do not disappear out from // under live iterators) void Ref(); @@ -114,6 +120,15 @@ class Version { class LevelFileNumIterator; Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const; + // Call func(arg, level, f) for every file that overlaps user_key in + // order from newest to oldest. If an invocation of func returns + // false, makes no more calls. + // + // REQUIRES: user portion of internal_key == user_key. + void ForEachOverlapping(Slice user_key, Slice internal_key, + void* arg, + bool (*func)(void*, int, FileMetaData*)); + VersionSet* vset_; // VersionSet to which this Version belongs Version* next_; // Next version in linked list Version* prev_; // Previous version in linked list diff --git a/doc/impl.html b/doc/impl.html index e870795..28817fe 100644 --- a/doc/impl.html +++ b/doc/impl.html @@ -11,7 +11,7 @@ The implementation of leveldb is similar in spirit to the representation of a single -<a href="http://labs.google.com/papers/bigtable.html"> +<a href="http://research.google.com/archive/bigtable.html"> Bigtable tablet (section 5.3)</a>. However the organization of the files that make up the representation is somewhat different and is explained below. diff --git a/include/leveldb/db.h b/include/leveldb/db.h index da8b11a..259a81f 100644 --- a/include/leveldb/db.h +++ b/include/leveldb/db.h @@ -14,7 +14,7 @@ namespace leveldb { // Update Makefile if you change these static const int kMajorVersion = 1; -static const int kMinorVersion = 12; +static const int kMinorVersion = 14; struct Options; struct ReadOptions; diff --git a/issues/issue200_test.cc b/issues/issue200_test.cc new file mode 100644 index 0000000..1cec79f --- /dev/null +++ b/issues/issue200_test.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2013 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +// Test for issue 200: when iterator switches direction from backward +// to forward, the current key can be yielded unexpectedly if a new +// mutation has been added just before the current key. + +#include "leveldb/db.h" +#include "util/testharness.h" + +namespace leveldb { + +class Issue200 { }; + +TEST(Issue200, Test) { + // Get rid of any state from an old run. + std::string dbpath = test::TmpDir() + "/leveldb_issue200_test"; + DestroyDB(dbpath, Options()); + + DB *db; + Options options; + options.create_if_missing = true; + ASSERT_OK(DB::Open(options, dbpath, &db)); + + WriteOptions write_options; + ASSERT_OK(db->Put(write_options, "1", "b")); + ASSERT_OK(db->Put(write_options, "2", "c")); + ASSERT_OK(db->Put(write_options, "3", "d")); + ASSERT_OK(db->Put(write_options, "4", "e")); + ASSERT_OK(db->Put(write_options, "5", "f")); + + ReadOptions read_options; + Iterator *iter = db->NewIterator(read_options); + + // Add an element that should not be reflected in the iterator. + ASSERT_OK(db->Put(write_options, "25", "cd")); + + iter->Seek("5"); + ASSERT_EQ(iter->key().ToString(), "5"); + iter->Prev(); + ASSERT_EQ(iter->key().ToString(), "4"); + iter->Prev(); + ASSERT_EQ(iter->key().ToString(), "3"); + iter->Next(); + ASSERT_EQ(iter->key().ToString(), "4"); + iter->Next(); + ASSERT_EQ(iter->key().ToString(), "5"); + + delete iter; + delete db; + DestroyDB(dbpath, options); +} + +} // namespace leveldb + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/util/arena.cc b/util/arena.cc index 9551d6a..9367f71 100644 --- a/util/arena.cc +++ b/util/arena.cc @@ -40,7 +40,7 @@ char* Arena::AllocateFallback(size_t bytes) { } char* Arena::AllocateAligned(size_t bytes) { - const int align = sizeof(void*); // We'll align to pointer size + const int align = (sizeof(void*) > 8) ? sizeof(void*) : 8; assert((align & (align-1)) == 0); // Pointer size should be a power of 2 size_t current_mod = reinterpret_cast<uintptr_t>(alloc_ptr_) & (align-1); size_t slop = (current_mod == 0 ? 0 : align - current_mod); diff --git a/util/env_posix.cc b/util/env_posix.cc index a3f197d..3e2925d 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -319,8 +319,39 @@ class PosixMmapFile : public WritableFile { return Status::OK(); } - virtual Status Sync() { + Status SyncDirIfManifest() { + const char* f = filename_.c_str(); + const char* sep = strrchr(f, '/'); + Slice basename; + std::string dir; + if (sep == NULL) { + dir = "."; + basename = f; + } else { + dir = std::string(f, sep - f); + basename = sep + 1; + } Status s; + if (basename.starts_with("MANIFEST")) { + int fd = open(dir.c_str(), O_RDONLY); + if (fd < 0) { + s = IOError(dir, errno); + } else { + if (fsync(fd) < 0) { + s = IOError(dir, errno); + } + close(fd); + } + } + return s; + } + + virtual Status Sync() { + // Ensure new files referred to by the manifest are in the filesystem. + Status s = SyncDirIfManifest(); + if (!s.ok()) { + return s; + } if (pending_sync_) { // Some unmapped data was not synced diff --git a/util/random.h b/util/random.h index 0753824..ddd51b1 100644 --- a/util/random.h +++ b/util/random.h @@ -16,7 +16,12 @@ class Random { private: uint32_t seed_; public: - explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) { } + explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) { + // Avoid bad seeds. + if (seed_ == 0 || seed_ == 2147483647L) { + seed_ = 1; + } + } uint32_t Next() { static const uint32_t M = 2147483647L; // 2^31-1 static const uint64_t A = 16807; // bits 14, 8, 7, 5, 2, 1, 0 |