summaryrefslogtreecommitdiff
path: root/deps/theft/theft_bloom.c
diff options
context:
space:
mode:
Diffstat (limited to 'deps/theft/theft_bloom.c')
-rw-r--r--deps/theft/theft_bloom.c145
1 files changed, 145 insertions, 0 deletions
diff --git a/deps/theft/theft_bloom.c b/deps/theft/theft_bloom.c
new file mode 100644
index 0000000..7b4ddb8
--- /dev/null
+++ b/deps/theft/theft_bloom.c
@@ -0,0 +1,145 @@
+#include <string.h>
+#include <stdio.h>
+
+#include "theft.h"
+#include "theft_bloom.h"
+
+#define DEFAULT_BLOOM_BITS 17
+#define DEBUG_BLOOM_FILTER 0
+#define LOG2_BITS_PER_BYTE 3
+#define HASH_COUNT 4
+
+static uint8_t get_bits_set_count(uint8_t counts[256], uint8_t byte);
+
+/* Initialize a bloom filter. */
+struct theft_bloom *theft_bloom_init(uint8_t bit_size2) {
+ size_t sz = 1 << (bit_size2 - LOG2_BITS_PER_BYTE);
+ struct theft_bloom *b = malloc(sizeof(struct theft_bloom) + sz);
+ if (b) {
+ b->size = sz;
+ b->bit_count = bit_size2;
+ memset(b->bits, 0, sz);
+ }
+ return b;
+}
+
+/* Hash data and mark it in the bloom filter. */
+void theft_bloom_mark(struct theft_bloom *b, uint8_t *data, size_t data_size) {
+ uint64_t hash = theft_hash_onepass(data, data_size);
+ uint8_t bc = b->bit_count;
+ uint64_t mask = (1 << bc) - 1;
+
+ /* Use HASH_COUNT distinct slices of MASK bits as hashes for the bloom filter. */
+ int bit_inc = (64 - bc) / HASH_COUNT;
+
+ for (int i = 0; i < (64 - bc); i += bit_inc) {
+ uint64_t v = (hash & (mask << i)) >> i;
+ size_t offset = v / 8;
+ uint8_t bit = 1 << (v & 0x07);
+ b->bits[offset] |= bit;
+ }
+}
+
+/* Check whether the data's hash is in the bloom filter. */
+bool theft_bloom_check(struct theft_bloom *b, uint8_t *data, size_t data_size) {
+ uint64_t hash = theft_hash_onepass(data, data_size);
+ uint8_t bc = b->bit_count;
+ uint64_t mask = (1 << bc) - 1;
+
+ int bit_inc = (64 - bc) / HASH_COUNT;
+
+ for (int i = 0; i < (64 - bc); i += bit_inc) {
+ uint64_t v = (hash & (mask << i)) >> i;
+ size_t offset = v / 8;
+ uint8_t bit = 1 << (v & 0x07);
+ if (0 == (b->bits[offset] & bit)) { return false; }
+ }
+
+ return true;
+}
+
+/* Free the bloom filter. */
+void theft_bloom_free(struct theft_bloom *b) { free(b); }
+
+/* Dump the bloom filter's contents. (Debugging.) */
+void theft_bloom_dump(struct theft_bloom *b) {
+ uint8_t counts[256];
+ memset(counts, 0xFF, sizeof(counts));
+
+ size_t total = 0;
+ uint16_t row_total = 0;
+
+ for (size_t i = 0; i < b->size; i++) {
+ uint8_t count = get_bits_set_count(counts, b->bits[i]);
+ total += count;
+ row_total += count;
+ #if DEBUG_BLOOM_FILTER > 1
+ char c = (count == 0 ? '.' : '0' + count);
+ printf("%c", c);
+ if ((i & 63) == 0 || i == b->size - 1) {
+ printf(" - %2.1f%%\n",
+ (100 * row_total) / (64.0 * 8));
+ row_total = 0;
+ }
+ #endif
+ }
+
+ #if DEBUG_BLOOM_FILTER
+ printf(" -- bloom filter: %zd of %zd bits set (%2d%%)\n",
+ total, 8*b->size, (int)((100.0 * total)/(8.0*b->size)));
+ #endif
+
+ /* If the total number of bits set is > the number of bytes
+ * in the table (i.e., > 1/8 full) then warn the user. */
+ if (total > b->size) {
+ fprintf(stderr, "\nWARNING: bloom filter is %zd%% full, "
+ "larger bloom_bits value recommended.\n",
+ (size_t)((100 * total) / (8 * b->size)));
+ }
+}
+
+/* Recommend a bloom filter size for a given number of trials. */
+uint8_t theft_bloom_recommendation(int trials) {
+ /* With a preferred priority of false positives under 0.1%,
+ * the required number of bits m in the bloom filter is:
+ * m = -lg(0.001)/(lg(2)^2) == 14.378 ~= 14,
+ * so we want an array with 1 << ceil(log2(14*trials)) cells.
+ *
+ * Note: The above formula is for the *ideal* number of hash
+ * functions, but we're using a hardcoded count. It appears to work
+ * well enough in practice, though, and this can be adjusted later
+ * without impacting the API. */
+ #define TRIAL_MULTIPILER 14
+ uint8_t res = DEFAULT_BLOOM_BITS;
+
+ const uint8_t min = THEFT_BLOOM_BITS_MIN - LOG2_BITS_PER_BYTE;
+ const uint8_t max = THEFT_BLOOM_BITS_MAX - LOG2_BITS_PER_BYTE;
+
+ for (uint8_t i = min; i < max; i++) {
+ int32_t v = (1 << i);
+ if (v > (TRIAL_MULTIPILER * trials)) {
+ res = i + LOG2_BITS_PER_BYTE;
+ break;
+ }
+ }
+
+ #if DEBUG_BLOOM_FILTER
+ size_t sz = 1 << (res - LOG2_BITS_PER_BYTE);
+ printf("Using %zd bytes for bloom filter: %d trials -> bit_size2 %u\n",
+ sizeof(struct theft_bloom) + sz, trials, res);
+ #endif
+
+ return res;
+}
+
+/* Check a byte->bits set table, and lazily populate it. */
+static uint8_t get_bits_set_count(uint8_t counts[256], uint8_t byte) {
+ uint8_t v = counts[byte];
+ if (v != 0xFF) { return v; }
+ uint8_t t = 0;
+ for (uint8_t i = 0; i < 8; i++) {
+ if (byte & (1 << i)) { t++; }
+ }
+ counts[byte] = t;
+ return t;
+}