netsurf: branch rjek/bloom created. release/3.0-73-g7d60132

NetSurf Browser Project (Commit Mailer) no-reply at netsurf-browser.org
Thu May 16 17:15:41 BST 2013


Gitweb links:

...log http://git.netsurf-browser.org/netsurf.git/shortlog/7d60132816341e15a853f2f66f06d0755d0f2daa
...commit http://git.netsurf-browser.org/netsurf.git/commit/7d60132816341e15a853f2f66f06d0755d0f2daa
...tree http://git.netsurf-browser.org/netsurf.git/tree/7d60132816341e15a853f2f66f06d0755d0f2daa

The branch, rjek/bloom has been created
        at  7d60132816341e15a853f2f66f06d0755d0f2daa (commit)

- Log -----------------------------------------------------------------
commitdiff http://git.netsurf-browser.org/netsurf.git/commit/?id=7d60132816341e15a853f2f66f06d0755d0f2daa
commit 7d60132816341e15a853f2f66f06d0755d0f2daa
Author: Rob Kendrick (humdrum) <rob.kendrick at codethink.co.uk>
Commit: Rob Kendrick (humdrum) <rob.kendrick at codethink.co.uk>

    Painfully trivial bloom filter implementation to experiment with optimising urldb lookups

diff --git a/utils/bloom.c b/utils/bloom.c
new file mode 100644
index 0000000..e6b9dcf
--- /dev/null
+++ b/utils/bloom.c
@@ -0,0 +1,162 @@
+/*
+ * Copyright 2013 Rob Kendrick <rjek at netsurf-browser.org>
+ *
+ * This file is part of NetSurf, http://www.netsurf-browser.org/
+ *
+ * NetSurf is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * NetSurf is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** \file
+ * Trivial bloom filter */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "utils/bloom.h"
+
+/**
+ * Hash a string, returning a 32bit value.  The hash algorithm used is
+ * Fowler Noll Vo - a very fast and simple hash, ideal for short strings.
+ * See http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash for more details.
+ *
+ * \param  datum   The string to hash.
+ * \param  len	   size_t of data length.
+ * \return The calculated hash value for the datum.
+ */
+
+static inline uint32_t fnv(const char *datum, size_t len)
+{
+	uint32_t z = 0x811c9dc5;
+	
+	if (datum == NULL)
+		return 0;
+
+	while (len--) {
+		z *= 0x01000193;
+		z ^= *datum++;
+	}
+
+	return z;
+}
+
+struct bloom_filter {
+	size_t size;
+	uint32_t items;
+	uint8_t filter[];
+};
+
+struct bloom_filter *bloom_create(size_t size)
+{
+	struct bloom_filter *r = calloc(sizeof(*r) + size, 1);
+        
+	if (r == NULL)
+		return NULL;
+        
+	r->size = size;
+        
+	return r;
+}
+
+void bloom_destroy(struct bloom_filter *b)
+{
+        free(b);
+}
+
+void bloom_insert_str(struct bloom_filter *b, const char *s, size_t z)
+{
+	uint32_t hash = fnv(s, z);
+	bloom_insert_hash(b, hash);
+}
+
+void bloom_insert_hash(struct bloom_filter *b, uint32_t hash)
+{
+	int index = hash % b->size;
+	int bit = hash % 8;
+
+	b->filter[index] |= (1 << bit);
+	b->items++;
+}
+
+bool bloom_search_str(struct bloom_filter *b, const char *s, size_t z)
+{
+	uint32_t hash = fnv(s, z);
+	return bloom_search_hash(b, hash);
+}
+
+bool bloom_search_hash(struct bloom_filter *b, uint32_t hash)
+{
+	int index = hash % b->size;
+	int bit = hash % 8;	
+	
+	return (b->filter[index] & (1 << bit)) != 0;
+}
+
+uint32_t bloom_items(struct bloom_filter *b)
+{
+	return b->items;
+}
+
+#ifdef TEST_RIG
+
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+int main(int argc, char *arg[])
+{
+	struct bloom_filter *b = bloom_create(8192);
+	FILE *dict = fopen("/usr/share/dict/words", "r");
+	char buf[BUFSIZ];
+	int false_positives = 0, total = 0;
+	
+	for (int i = 0; i < 8192; i++) {
+		fscanf(dict, "%s", buf);
+		printf("adding %s\n", buf);
+		bloom_insert_str(b, buf, strlen(buf));
+	}
+	
+	printf("adding NetSurf\n");
+	
+	bloom_insert_str(b, "NetSurf", 7);
+	printf("checking NetSurf (should be true)\n");
+	assert(bloom_search_str(b, "NetSurf", 7));
+	
+	fseek(dict, 0, SEEK_SET);
+	
+	for (int i = 0; i < 8192; i++) {
+		fscanf(dict, "%s", buf);
+		printf("checking %s (should be true)\n", buf);
+		assert(bloom_search_str(b, buf, strlen(buf)));
+		
+		total++;
+	}
+	
+	for (int i = 0; i < 8192; i++) {
+		fscanf(dict, "%s", buf);
+		printf("checking %s (should be false)\n", buf);
+		if (bloom_search_str(b, buf, strlen(buf)) == true)
+			false_positives++;
+		total++;
+	}
+	
+	printf("false positives: %d of %d, %f%%\n",
+		false_positives, total,
+		((float)false_positives / total) * 100);
+	
+	fclose(dict);
+	bloom_destroy(b);
+	
+	return 0;
+}
+
+#endif /* TEST_RIG */
+
diff --git a/utils/bloom.h b/utils/bloom.h
new file mode 100644
index 0000000..4a7bd38
--- /dev/null
+++ b/utils/bloom.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2013 Rob Kendrick <rjek at netsurf-browser.org>
+ *
+ * This file is part of NetSurf, http://www.netsurf-browser.org/
+ *
+ * NetSurf is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * NetSurf is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** \file
+ * Trivial bloom filter */
+
+#ifndef _NETSURF_UTILS_BLOOM_H_
+#define _NETSURF_UTILS_BLOOM_H_
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+struct bloom_filter;
+
+/**
+ * Create a new bloom filter.
+ * 
+ * \param size Size of bloom filter in bytes
+ * \return Handle for newly-created bloom filter, or NULL
+ */
+struct bloom_filter *bloom_create(size_t size);
+
+/**
+ * Destroy a previously-created bloom filter
+ * 
+ * \param b Bloom filter to destroy
+ */
+void bloom_destroy(struct bloom_filter *b);
+
+/**
+ * Insert a string of given length (may include NULs) into the filter,
+ * using an internal hash function.
+ * 
+ * \param b Bloom filter to add to
+ * \param s Pointer to data
+ * \param z Length of data
+ */
+void bloom_insert_str(struct bloom_filter *b, const char *s, size_t z);
+
+/**
+ * Insert a given hash value into the filter, should you already have
+ * one to hand.
+ * 
+ * \param b Bloom filter to add to
+ * \param hash Value to add
+ */
+void bloom_insert_hash(struct bloom_filter *b, uint32_t hash);
+
+/**
+ * Search the filter for the given string, assuming it was added by
+ * bloom_insert_str().   May return false-positives.
+ * 
+ * \param b Bloom filter to search
+ * \param s Pointer to data to search for
+ * \param z Length of data
+ * 
+ * \return False if never added, True if it might have been.
+ */
+bool bloom_search_str(struct bloom_filter *b, const char *s, size_t z);
+
+/**
+ * Search the filter for the given hash value, assuming it was added by
+ * bloom_insert_hash().  May return false-positives.
+ * 
+ * \param b Bloom filter to search
+ * \param hash Hash value to search for
+ * 
+ * \return False if never added, True if it might have been.
+ */
+bool bloom_search_hash(struct bloom_filter *b, uint32_t hash);
+
+/**
+ * Find out how many items have been added to this bloom filter.  This
+ * is useful for deciding the size of a new bloom filter should you
+ * need to rehash it.
+ * 
+ * \param b Bloom filter to examine
+ * 
+ * \return Number of items that have been added
+ */
+uint32_t bloom_items(struct bloom_filter *b);
+
+#endif


-----------------------------------------------------------------------


-- 
NetSurf Browser



More information about the netsurf-commits mailing list