summaryrefslogtreecommitdiff
path: root/deps/theft/theft_bloom.c
blob: 7b4ddb80b2c48c187196fe5b339679e0e35a973c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#include <string.h>
#include <stdio.h>

#include "theft.h"
#include "theft_bloom.h"

#define DEFAULT_BLOOM_BITS 17
#define DEBUG_BLOOM_FILTER 0
#define LOG2_BITS_PER_BYTE 3
#define HASH_COUNT 4

static uint8_t get_bits_set_count(uint8_t counts[256], uint8_t byte);

/* Initialize a bloom filter. */
struct theft_bloom *theft_bloom_init(uint8_t bit_size2) {
    size_t sz = 1 << (bit_size2 - LOG2_BITS_PER_BYTE);
    struct theft_bloom *b = malloc(sizeof(struct theft_bloom) + sz);
    if (b) {
        b->size = sz;
        b->bit_count = bit_size2;
        memset(b->bits, 0, sz);
    }
    return b;
}

/* Hash data and mark it in the bloom filter. */
void theft_bloom_mark(struct theft_bloom *b, uint8_t *data, size_t data_size) {
    uint64_t hash = theft_hash_onepass(data, data_size);
    uint8_t bc = b->bit_count;
    uint64_t mask = (1 << bc) - 1;

    /* Use HASH_COUNT distinct slices of MASK bits as hashes for the bloom filter. */
    int bit_inc = (64 - bc) / HASH_COUNT;

    for (int i = 0; i < (64 - bc); i += bit_inc) {
        uint64_t v = (hash & (mask << i)) >> i;
        size_t offset = v / 8;
        uint8_t bit = 1 << (v & 0x07);
        b->bits[offset] |= bit;
    }
}

/* Check whether the data's hash is in the bloom filter. */
bool theft_bloom_check(struct theft_bloom *b, uint8_t *data, size_t data_size) {
    uint64_t hash = theft_hash_onepass(data, data_size);
    uint8_t bc = b->bit_count;
    uint64_t mask = (1 << bc) - 1;

    int bit_inc = (64 - bc) / HASH_COUNT;

    for (int i = 0; i < (64 - bc); i += bit_inc) {
        uint64_t v = (hash & (mask << i)) >> i;
        size_t offset = v / 8;
        uint8_t bit = 1 << (v & 0x07);
        if (0 == (b->bits[offset] & bit)) { return false; }
    }

    return true;
}

/* Free the bloom filter. */
void theft_bloom_free(struct theft_bloom *b) { free(b); }

/* Dump the bloom filter's contents. (Debugging.) */
void theft_bloom_dump(struct theft_bloom *b) {
    uint8_t counts[256];
    memset(counts, 0xFF, sizeof(counts));

    size_t total = 0;
    uint16_t row_total = 0;
    
    for (size_t i = 0; i < b->size; i++) {
        uint8_t count = get_bits_set_count(counts, b->bits[i]);
        total += count;
        row_total += count;
        #if DEBUG_BLOOM_FILTER > 1
        char c = (count == 0 ? '.' : '0' + count);
        printf("%c", c);
        if ((i & 63) == 0 || i == b->size - 1) {
            printf(" - %2.1f%%\n",
                (100 * row_total) / (64.0 * 8));
            row_total = 0;
        }
        #endif
    }

    #if DEBUG_BLOOM_FILTER
    printf(" -- bloom filter: %zd of %zd bits set (%2d%%)\n",
        total, 8*b->size, (int)((100.0 * total)/(8.0*b->size)));
    #endif

    /* If the total number of bits set is > the number of bytes
     * in the table (i.e., > 1/8 full) then warn the user. */
    if (total > b->size) {
        fprintf(stderr, "\nWARNING: bloom filter is %zd%% full, "
            "larger bloom_bits value recommended.\n",
            (size_t)((100 * total) / (8 * b->size)));
    }
}

/* Recommend a bloom filter size for a given number of trials. */
uint8_t theft_bloom_recommendation(int trials) {
    /* With a preferred priority of false positives under 0.1%,
     * the required number of bits m in the bloom filter is:
     *     m = -lg(0.001)/(lg(2)^2) == 14.378 ~= 14,
     * so we want an array with 1 << ceil(log2(14*trials)) cells.
     *
     * Note: The above formula is for the *ideal* number of hash
     * functions, but we're using a hardcoded count. It appears to work
     * well enough in practice, though, and this can be adjusted later
     * without impacting the API. */
    #define TRIAL_MULTIPILER 14
    uint8_t res = DEFAULT_BLOOM_BITS;

    const uint8_t min = THEFT_BLOOM_BITS_MIN - LOG2_BITS_PER_BYTE;
    const uint8_t max = THEFT_BLOOM_BITS_MAX - LOG2_BITS_PER_BYTE;

    for (uint8_t i = min; i < max; i++) {
        int32_t v = (1 << i);
        if (v > (TRIAL_MULTIPILER * trials)) {
            res = i + LOG2_BITS_PER_BYTE;
            break;
        }
    }

    #if DEBUG_BLOOM_FILTER
    size_t sz = 1 << (res - LOG2_BITS_PER_BYTE);
    printf("Using %zd bytes for bloom filter: %d trials -> bit_size2 %u\n",
        sizeof(struct theft_bloom) + sz, trials, res);
    #endif

    return res;
}

/* Check a byte->bits set table, and lazily populate it. */
static uint8_t get_bits_set_count(uint8_t counts[256], uint8_t byte) {
    uint8_t v = counts[byte];
    if (v != 0xFF) { return v; }
    uint8_t t = 0;
    for (uint8_t i = 0; i < 8; i++) {
        if (byte & (1 << i)) { t++; }
    }
    counts[byte] = t;
    return t;
}