aboutsummaryrefslogtreecommitdiff
path: root/java/com/google/setfilters/cuckoofilter/CuckooFilter.java
diff options
context:
space:
mode:
Diffstat (limited to 'java/com/google/setfilters/cuckoofilter/CuckooFilter.java')
-rw-r--r--java/com/google/setfilters/cuckoofilter/CuckooFilter.java278
1 files changed, 278 insertions, 0 deletions
diff --git a/java/com/google/setfilters/cuckoofilter/CuckooFilter.java b/java/com/google/setfilters/cuckoofilter/CuckooFilter.java
new file mode 100644
index 0000000..b37840b
--- /dev/null
+++ b/java/com/google/setfilters/cuckoofilter/CuckooFilter.java
@@ -0,0 +1,278 @@
+// Copyright 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package com.google.setfilters.cuckoofilter;
+
+import com.google.common.hash.Funnel;
+import com.google.common.hash.HashCode;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Optional;
+import java.util.Random;
+
+/**
+ * A space efficient, probabilistic multiset data structure that supports membership check,
+ * insertion, and deletion of the elements.
+ *
+ * <p>Cuckoo filter enables tradeoffs between its space efficiency and the false positive
+ * probability of the membership check.
+ *
+ * <p>See the original paper https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf for more
+ * details.
+ *
+ * <p>This class is not thread-safe.
+ */
+public final class CuckooFilter<T> {
+ private final CuckooFilterConfig config;
+ private final CuckooFilterTable table;
+ private final Funnel<? super T> funnel;
+ private final Random random;
+
+ /** Counts the total number of elements in the cuckoo filter. */
+ private long count;
+
+ /** Instantiates a new cuckoo filter. */
+ public static <T> CuckooFilter<T> createNew(CuckooFilterConfig config, Funnel<? super T> funnel) {
+ Random random = new Random();
+ CuckooFilterTable table =
+ CuckooFilterTable.create(config.size(), config.useSpaceOptimization(), random);
+ return new CuckooFilter<T>(config, table, funnel, random);
+ }
+
+ /**
+ * Instantiates a cuckoo filter from serialized cuckoo filter table.
+ *
+ * <p>Note that {@link SerializedCuckooFilterTable} does not contain any data on {@link
+ * CuckooFilterConfig.HashFunction}, {@link CuckooFilterConfig.Strategy}, or {@link Funnel} used,
+ * so it is up to the user to supply appropriate hash function, strategy, and funnel that were
+ * used to generate the {@link SerializedCuckooFilterTable}.
+ */
+ public static <T> CuckooFilter<T> createFromSerializedTable(
+ SerializedCuckooFilterTable serializedTable,
+ CuckooFilterConfig.HashFunction hashFunction,
+ CuckooFilterConfig.Strategy strategy,
+ Funnel<? super T> funnel) {
+ Random random = new Random();
+ CuckooFilterTable table = CuckooFilterTable.createFromSerialization(serializedTable, random);
+ return new CuckooFilter<T>(
+ CuckooFilterConfig.newBuilder()
+ .setSize(table.size())
+ .setHashFunction(hashFunction)
+ .setStrategy(strategy)
+ .build(),
+ table,
+ funnel,
+ random);
+ }
+
+ private CuckooFilter(
+ CuckooFilterConfig config, CuckooFilterTable table, Funnel<? super T> funnel, Random random) {
+ this.config = config;
+ this.table = table;
+ this.funnel = funnel;
+ this.random = random;
+ count = 0;
+ }
+
+ /**
+ * Returns true if {@code element} is in the cuckoo filter.
+ *
+ * <p>By the probabilistic nature of the cuckoo filter data structure, this method may return a
+ * false positive result. In other words, this method may incorrectly return true for an element
+ * that was actually never inserted. This probability can depend on various factors, including the
+ * size of the cuckoo filter and the hash function used.
+ *
+ * <p>However, it is guaranteed that this method never returns a false negative result, as long as
+ * {@code delete} method is called on an element that exists in the filter. Please see {@code
+ * delete} method for more details.
+ */
+ public boolean contains(T element) {
+ HashCode hash = config.hashFunction().hash(element, funnel);
+ long fingerprint =
+ config.strategy().computeFingerprint(hash, config.size().fingerprintLength());
+ int bucketIndex = config.strategy().computeBucketIndex(hash, config.size().bucketCount());
+ int otherBucketIndex =
+ config
+ .strategy()
+ .computeOtherBucketIndex(
+ fingerprint, bucketIndex, config.size().bucketCount(), config.hashFunction());
+ return table.contains(bucketIndex, fingerprint)
+ || table.contains(otherBucketIndex, fingerprint);
+ }
+
+ /**
+ * Inserts {@code element} to the cuckoo filter, returning true if the element was inserted
+ * successfully.
+ *
+ * <p>Insertion of {@code element} will fail if there is no room for {@code element}. Note that
+ * even when the insertion of {@code element} fails, it is possible for another element to be
+ * inserted successfully. Even then, the insertion failure should be a good indicator that the
+ * filter is getting close to its maximum capacity.
+ */
+ public boolean insert(T element) {
+ HashCode hash = config.hashFunction().hash(element, funnel);
+ long fingerprint =
+ config.strategy().computeFingerprint(hash, config.size().fingerprintLength());
+ int bucketIndex = config.strategy().computeBucketIndex(hash, config.size().bucketCount());
+ int otherBucketIndex =
+ config
+ .strategy()
+ .computeOtherBucketIndex(
+ fingerprint, bucketIndex, config.size().bucketCount(), config.hashFunction());
+
+ // First attempt to insert the fingerprint to one of the two assigned buckets.
+ if (attemptInsertion(fingerprint, bucketIndex, otherBucketIndex)) {
+ count++;
+ return true;
+ }
+
+ // If both buckets are full, execute insertion with repeated replacements algorithm.
+ int startBucketIndex = (random.nextInt(2) == 0) ? bucketIndex : otherBucketIndex;
+ boolean inserted = insertWithRepeatedReplacements(fingerprint, startBucketIndex);
+ if (inserted) {
+ count++;
+ }
+ return inserted;
+ }
+
+ /**
+ * Deletes {@code element} from the cuckoo filter, returning true if the element was deleted
+ * successfully.
+ *
+ * <p>It is critical for {@code delete} to be called on an already existing element. Otherwise,
+ * the filter may incorrectly delete a wrong element. When this happens, it is possible for {@code
+ * contains} method to return a false negative result.
+ */
+ public boolean delete(T element) {
+ HashCode hash = config.hashFunction().hash(element, funnel);
+ long fingerprint =
+ config.strategy().computeFingerprint(hash, config.size().fingerprintLength());
+ int bucketIndex = config.strategy().computeBucketIndex(hash, config.size().bucketCount());
+ int otherBucketIndex =
+ config
+ .strategy()
+ .computeOtherBucketIndex(
+ fingerprint, bucketIndex, config.size().bucketCount(), config.hashFunction());
+ boolean deleted =
+ table.delete(bucketIndex, fingerprint) || table.delete(otherBucketIndex, fingerprint);
+ if (deleted) {
+ count--;
+ }
+ return deleted;
+ }
+
+ /** Returns the size of the cuckoo filter. */
+ public CuckooFilterConfig.Size size() {
+ return config.size();
+ }
+
+ /** Returns the count of the elements in the cuckoo filter. */
+ public long count() {
+ return count;
+ }
+
+ /**
+ * Returns the ratio of the total number of elements in the cuckoo filter and the theoretical max
+ * capacity.
+ *
+ * <p>The returned value is in range [0, 1].
+ */
+ public double load() {
+ return count / ((double) config.size().bucketCount() * config.size().bucketCapacity());
+ }
+
+ /**
+ * Serializes the state of the cuckoo filter table.
+ *
+ * <p>Note that this method does not serialize hash function, strategy, and funnel. When
+ * instantiating a cuckoo filter from the returned {@link SerializedCuckooFilterTable}, it is up
+ * to the user to supply appropriate hash function, strategy, and funnel that were used.
+ */
+ public SerializedCuckooFilterTable serializeTable() {
+ return table.serialize();
+ }
+
+ /**
+ * Attempts to insert {@code fingerprint} to one of the buckets with indices {@code bucketIndex}
+ * and {@code otherBucketIndex}, returning true when successful. Returns false if both buckets are
+ * full and the insertion failed.
+ */
+ private boolean attemptInsertion(long fingerprint, int bucketIndex, int otherBucketIndex) {
+ if (!table.isFull(bucketIndex)) {
+ table.insertWithReplacement(bucketIndex, fingerprint);
+ return true;
+ }
+ if (!table.isFull(otherBucketIndex)) {
+ table.insertWithReplacement(otherBucketIndex, fingerprint);
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * Randomly traverses the cuckoo graph to find an available bucket for insertion.
+ *
+ * <p>At a high level, this algorithm starts at vertex {@code bucketIndex} and performs a random
+ * walk of length at most {@link CuckooFilterConfig.Strategy#maxReplacementCount}. If an available
+ * bucket is found, the algorithm "pushes" all the fingerprints (edges) that are visited (note
+ * that in the cuckoo graph, the edges are the fingerprints) to their alternate buckets, and make
+ * room for {@code fingerprint} to be inserted.
+ *
+ * <p>If during the random walk an available bucket is not found, the insertion fails and the
+ * method returns false.
+ *
+ * <p>Note that it is possible to deterministically find an available bucket by performing breadth
+ * first search in the cuckoo graph, but this is usually slower and the extra chance of successful
+ * insertion is negligibly small in practice.
+ */
+ private boolean insertWithRepeatedReplacements(long fingerprint, int bucketIndex) {
+ List<Integer> visitedBucketIndices = new ArrayList<>();
+ List<Long> replacedFingerprints = new ArrayList<>();
+
+ long currFingerprint = fingerprint;
+ int currBucketIndex = bucketIndex;
+ visitedBucketIndices.add(-1); // Just for index alignment purpose.
+ replacedFingerprints.add(currFingerprint);
+ for (int i = 0; i < config.strategy().maxReplacementCount(); i++) {
+ Optional<Long> replacedFingerprint =
+ table.insertWithReplacement(currBucketIndex, currFingerprint);
+ // Found an available bucket, and the insertion is successful.
+ if (replacedFingerprint.isEmpty()) {
+ return true;
+ }
+
+ visitedBucketIndices.add(currBucketIndex);
+ replacedFingerprints.add(replacedFingerprint.get());
+
+ currFingerprint = replacedFingerprint.get();
+ currBucketIndex =
+ config
+ .strategy()
+ .computeOtherBucketIndex(
+ currFingerprint,
+ currBucketIndex,
+ config.size().bucketCount(),
+ config.hashFunction());
+ }
+
+ // Failed to find a bucket to insert. Reverse the replacements and declare that the insertion
+ // failed.
+ for (int i = visitedBucketIndices.size() - 1; i > 0; i--) {
+ int previousBucketIndex = visitedBucketIndices.get(i);
+ table.delete(previousBucketIndex, replacedFingerprints.get(i - 1));
+ table.insertWithReplacement(previousBucketIndex, replacedFingerprints.get(i));
+ }
+ return false;
+ }
+}