aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Makefile9
-rw-r--r--README.md57
-rw-r--r--benchmark.c155
-rw-r--r--domain-lookup.c158
-rw-r--r--domain-lookup.h10
-rw-r--r--test.c36
6 files changed, 425 insertions, 0 deletions
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..f83110e
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,9 @@
+CFLAGS = -march=native -pipe -fomit-frame-pointer -flto -O3
+
+all: test benchmark
+test: domain-lookup.c domain-lookup.h test.c
+benchmark: domain-lookup.c domain-lookup.h benchmark.c
+clean:
+ rm -f test benchmark
+
+.PHONY: clean
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..a606fc4
--- /dev/null
+++ b/README.md
@@ -0,0 +1,57 @@
+# Domain Lookup Tree
+### by Jason A. Donenfeld (<Jason@zx2c4.com>)
+
+Some applications, like [dnsmasq](http://www.thekelleys.org.uk/dnsmasq/doc.html), need to find the closest-match domain name. One way is by iterating through a list of domains and finding the one that has the longest length match. But a more efficient way is to build a tree of domain name components.
+
+This project provides a very simple C API for a domain name matching tree. The **[source code is simple, extremely well commented, and easy to read](http://git.zx2c4.com/domain-lookup-tree/tree/domain-lookup.c)**.
+
+## API
+
+#### `struct domain_lookup_tree* init_dlt()`
+
+Constructs a new tree head and returns a pointer to it.
+
+#### `void* find_dlt(struct domain_lookup_tree *head, const char *domain)`
+
+Returns the prior inserted data in `head` for a given `domain`.
+
+#### `int insert_dlt(struct domain_lookup_tree *head, const char *domain, void *data)`
+
+Inserts `data` into a tree given by `head` for a given `domain`. If `domain` has length of zero or is equal to `#`, it will act as the default match for non-matching lookups.
+
+
+## Example
+
+ struct domain_lookup_tree *domains = init_dlt();
+
+ insert_dlt(domains, "zx2c4.com", "zx2c4 root node");
+ insert_dlt(domains, "data.zx2c4.com", "zx2c4 data node");
+ insert_dlt(domains, "yahoo.co.uk", "yahoo.co.uk node");
+ insert_dlt(domains, "#", "generic node");
+
+ char *data = find_dlt(domains, "cheese.somewhere.zx2c4.com");
+ printf("zx2c4 root node == %s\n", data);
+
+## Benchmarks
+
+With gcc 4.7.2 on an Intel Core i7-3820QM, we show a **~587x speed improvement** over dnsmasq's old algorithm:
+
+ zx2c4@thinkpad ~/Projects/domain-lookup-tree $ make
+ cc -march=native -pipe -fomit-frame-pointer -flto -O3 benchmark.c domain-lookup.c domain-lookup.h -o benchmark
+
+ zx2c4@thinkpad ~/Projects/domain-lookup-tree $ ./benchmark
+ [+] Populating in-memory word list from /usr/share/dict/words.
+ [+] Truncating in-memory word list to 300 random words.
+ [+] Creating random lists of 100000 domains to insert and 100000 domains to query.
+ [+] Populating domain lookup tree.
+ [+] Performing lookup benchmarks:
+ [*] New method took 0.34 seconds.
+ [*] Old method took 199.45 seconds.
+ [*] The new method is 586.617647 times faster than the old method.
+ [+] Verifying that new and old methods produced identical results:
+ [*] New and old methods produced the same results.
+
+## Limitations & Improvements
+
+* Currently no helper functions for removing and freeing nodes
+* Instead of `strcmp`'ing, a hash function could be used
diff --git a/benchmark.c b/benchmark.c
new file mode 100644
index 0000000..1eb71fe
--- /dev/null
+++ b/benchmark.c
@@ -0,0 +1,155 @@
+#include "domain-lookup.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <time.h>
+
+static unsigned long range_rand(unsigned long min, unsigned long max)
+{
+ unsigned long base_random, range, remainder, bucket;
+
+ base_random = rand();
+ if (RAND_MAX == base_random)
+ return range_rand(min, max);
+ range = max - min;
+ remainder = RAND_MAX % range;
+ bucket = RAND_MAX / range;
+ if (base_random < RAND_MAX - remainder)
+ return min + base_random / bucket;
+ return range_rand(min, max);
+}
+
+
+static char* make_domain(char **dictionary, size_t size, size_t max_components)
+{
+ char *domain;
+ int i;
+
+ domain = malloc(512);
+ domain[0] = 0;
+
+ /* strcat can overflow. I don't care. */
+ for (i = 0; i < range_rand(2, max_components + 1); ++i) {
+ if (i > 0)
+ strcat(domain, ".");
+ strcat(domain, dictionary[range_rand(0, size)]);
+ }
+
+ return domain;
+}
+
+
+static void* old_find_dlt(char *domain, char **domains, size_t domains_size)
+{
+ unsigned int namelen, matchlen, domainlen, i;
+ char *matchstart, *out;
+
+ namelen = strlen(domain);
+ matchlen = 0;
+ out = 0;
+ for (i = 0; i < domains_size; ++i) {
+ domainlen = strlen(domains[i]);
+ matchstart = domain + namelen - domainlen;
+ if (namelen >= domainlen &&
+ (domainlen == 0 || namelen == domainlen || *(matchstart - 1) == '.' ) &&
+ domainlen >= matchlen && !strcmp(matchstart, domains[i])) {
+ matchlen = domainlen;
+ out = domains[i];
+ }
+ }
+ return out;
+}
+
+
+#define DICTIONARY_INITIAL_SIZE 250000
+#define DICTIONARY_TRUNCATED_SIZE 300
+#define INSERTED_DOMAINS_SIZE 100000
+#define QUERIED_DOMAINS_SIZE 100000
+#define INSERTED_DOMAINS_COMPONENTS 3
+#define QUERIED_DOMAINS_COMPONENTS 5
+#define DICTIONARY_PATH "/usr/share/dict/words"
+
+int main()
+{
+ FILE *dictionary_file;
+ char **dictionary, **inserted_domains, **queried_domains;
+ void **verification_new, **verification_old;
+ char *word;
+ size_t dictionary_size, word_size, i;
+ struct domain_lookup_tree *head;
+ clock_t start, old_time, new_time;
+
+ srand(time(NULL) + 42);
+
+ printf("[+] Populating in-memory word list from " DICTIONARY_PATH ".\n");
+
+ dictionary = malloc(DICTIONARY_INITIAL_SIZE * sizeof(char *));
+ dictionary_file = fopen(DICTIONARY_PATH, "r");
+ if (!dictionary_file || !dictionary)
+ return 1;
+ for (dictionary_size = 0; dictionary_size < DICTIONARY_INITIAL_SIZE; ++dictionary_size) {
+ word_size = 256;
+ word = malloc(word_size);
+ if (!word)
+ return 1;
+ if (getline(&word, &word_size, dictionary_file) < 0)
+ break;
+ word_size = strlen(word);
+ if (word[word_size - 1] == '\n')
+ word[word_size - 1] = 0;
+ dictionary[dictionary_size] = word;
+ }
+ fclose(dictionary_file);
+
+ printf("[+] Truncating in-memory word list to %d random words.\n", DICTIONARY_TRUNCATED_SIZE);
+
+ for (i = 0; i < DICTIONARY_TRUNCATED_SIZE; ++i)
+ dictionary[i] = dictionary[range_rand(0, dictionary_size)];
+
+ printf("[+] Creating random lists of %d domains to insert and %d domains to query.\n", INSERTED_DOMAINS_SIZE, QUERIED_DOMAINS_SIZE);
+
+ inserted_domains = malloc(INSERTED_DOMAINS_SIZE * sizeof(char *));
+ queried_domains = malloc(QUERIED_DOMAINS_SIZE * sizeof(char *));
+ verification_new = malloc(QUERIED_DOMAINS_SIZE * sizeof(char *));
+ verification_old = malloc(QUERIED_DOMAINS_SIZE * sizeof(char *));
+ if (!inserted_domains || !queried_domains || !verification_new || !verification_old)
+ return 1;
+
+ for (i = 0; i < QUERIED_DOMAINS_SIZE; ++i)
+ queried_domains[i] = make_domain(dictionary, DICTIONARY_TRUNCATED_SIZE, QUERIED_DOMAINS_COMPONENTS);
+ for (i = 0; i < INSERTED_DOMAINS_SIZE; ++i)
+ inserted_domains[i] = make_domain(dictionary, DICTIONARY_TRUNCATED_SIZE, INSERTED_DOMAINS_COMPONENTS);
+
+ printf("[+] Populating domain lookup tree.\n");
+
+ head = init_dlt();
+ for (i = 0; i < INSERTED_DOMAINS_SIZE; ++i)
+ insert_dlt(head, inserted_domains[i], inserted_domains[i]);
+
+ printf("[+] Performing lookup benchmarks:\n");
+
+ start = clock();
+ for (i = 0; i < QUERIED_DOMAINS_SIZE; ++i)
+ verification_new[i] = find_dlt(head, queried_domains[i]);
+ new_time = clock() - start;
+ printf(" [*] New method took %.2f seconds.\n", (double)new_time / CLOCKS_PER_SEC);
+
+ start = clock();
+ for (i = 0; i < QUERIED_DOMAINS_SIZE; ++i)
+ verification_old[i] = old_find_dlt(queried_domains[i], inserted_domains, INSERTED_DOMAINS_SIZE);
+ old_time = clock() - start;
+ printf(" [*] Old method took %.2f seconds.\n", (double)old_time / CLOCKS_PER_SEC);
+
+ printf(" [*] The new method is %f times faster than the old method.\n", (double)old_time / (double)new_time);
+
+ printf("[+] Verifying that new and old methods produced identical results:\n");
+ for (i = 0; i < QUERIED_DOMAINS_SIZE; ++i) {
+ if (verification_old[i] != verification_new[i]) {
+ printf(" [!] New and old method produced different results!\n");
+ return 1;
+ }
+ }
+ printf(" [*] New and old methods produced the same results.\n");
+ return 0;
+}
diff --git a/domain-lookup.c b/domain-lookup.c
new file mode 100644
index 0000000..2adfe8d
--- /dev/null
+++ b/domain-lookup.c
@@ -0,0 +1,158 @@
+/* domain-lookup.c is Copyright (c) 2013 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 dated June, 1991, or
+ (at your option) version 3 dated 29 June, 2007.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+
+
+ this.is.how.domains.are.stored.com
+ seeing.how.domains.are.stored.com
+ is.very.interesting.com
+
+ [head]
+ |
+ com
+ /\
+ stored interesting
+ / \
+ are very
+ / \
+ domains is
+ /
+ how
+ /\
+ is seeing
+ /
+ this
+
+*/
+
+
+#include "domain-lookup.h"
+#include <string.h>
+#include <stdlib.h>
+
+/* Single internal function for walking, defined bellow. */
+static struct domain_lookup_tree* walk_dlt(struct domain_lookup_tree *head, const char *domain_immutable, int with_data);
+
+/* The following three functions comprise the entire API. */
+struct domain_lookup_tree* init_dlt()
+{
+ struct domain_lookup_tree *head = malloc(sizeof(struct domain_lookup_tree));
+ if (!head)
+ return 0;
+ memset(head, 0, sizeof(struct domain_lookup_tree));
+ return head;
+}
+void* find_dlt(struct domain_lookup_tree *head, const char *domain)
+{
+ /* If head is non-null, walk_dlt will always return non-null. */
+ return walk_dlt(head, domain, 1)->data;
+}
+int insert_dlt(struct domain_lookup_tree *head, const char *domain, void *data)
+{
+ head = walk_dlt(head, domain, 0);
+ if (!head) /* This should only fail when malloc fails. */
+ return -1;
+ head->data = data;
+ return 0;
+}
+
+/* Where the actual meat happens. This walks down the tree starting at head.
+ * If with_data is true, it returns the closest matching domain's data that's
+ * been explicitly inserted. Otherwise, it returns an existing or newly-created
+ * node for the provided domain. */
+static struct domain_lookup_tree* walk_dlt(struct domain_lookup_tree *head, const char *domain, int with_data)
+{
+ struct domain_lookup_tree *child, *data_head;
+ const char *component, *right_dot;
+ int i, len, component_len;
+
+ /* We immediately return the root node for null, "", and "#". */
+ if (!domain)
+ return head;
+ len = strlen(domain);
+ if (!len)
+ return head;
+ if (!strcmp(domain, "#"))
+ return head;
+
+ /* data_head keeps track of the closest match that has been
+ * explicitly inserted -- which amounts to a node having a non-null
+ * data member. */
+ data_head = head;
+
+ /* The right-most dot starts as the null byte at the end of the string. */
+ right_dot = &domain[len];
+
+ /* Iterate backward through the domain. */
+ for (i = len - 1; i >= 0; --i) {
+ /* We're only interested when we reach a dot or the first character. */
+ if (domain[i] != '.' && i)
+ continue;
+
+ /* If we're at a dot, then our component is everything to the right of the dot.
+ * Note that this does not overflow, because right_dot is initialized to be
+ * domain[len], which is the max value of i + 1. */
+ if (domain[i] == '.')
+ component = &domain[i + 1];
+ else /* Otherwise, we're at the first character of the domain, and it's not a dot, */
+ component = &domain[i]; /* so our component just begins at this index. */
+
+ /* right_dot - component is the length of the component, up to the previous right dot. */
+ component_len = (int)(right_dot - component);
+
+ /* We adjust right_dot to be this latest dot we've found. If domain[i] isn't a dot,
+ i == 0, so this will be the last iteration anyway. */
+ right_dot = &domain[i];
+
+ /* If the component has no length, we skip it. This automatically elides adjacent dots. */
+ if (!component_len)
+ continue;
+
+ /* We iterate through all the children of the current head looking for a match. */
+ for (child = head->first_child; child; child = child->next_sibling) {
+ /* We want to compare only component_len bytes and also check that they have the same len. */
+ if (!strncmp(component, child->component, component_len) && strlen(child->component) == component_len)
+ break;
+ }
+
+ if (child) { /* child is non-null in the case where a match is found. */
+ /* We update the current head and make sure to update data_head
+ * if we found a child that was explicitly added. */
+ head = child;
+ if (head->data)
+ data_head = head;
+ } else { /* child is null in the case where no matching component was found. */
+ /* If we're just searching for the closest explicitly added
+ * match, return it immediately. */
+ if (with_data)
+ return data_head;
+ /* Otherwise, create a new child node and insert it. */
+ child = init_dlt();
+ if (!child) /* init_dlt calls malloc, which can fail. */
+ return 0;
+ /* The same length calculation from before. */
+ child->component = strndup(component, component_len);
+ if (!child->component) /* strdup calls malloc, which can fail. */
+ return 0;
+ /* Insert the new child at the front of the child list for the current head. */
+ child->next_sibling = head->first_child;
+ head->first_child = child;
+ /* Update the head to the new child. */
+ head = child;
+ }
+ }
+ /* If we made it here, we've either matched for every component, or created [a] new child[ren].
+ * In either case, we return either data_head or head, depending on which was requested. */
+ if (with_data)
+ return data_head;
+ return head;
+}
diff --git a/domain-lookup.h b/domain-lookup.h
new file mode 100644
index 0000000..8e45dee
--- /dev/null
+++ b/domain-lookup.h
@@ -0,0 +1,10 @@
+struct domain_lookup_tree {
+ char *component;
+ void *data;
+ struct domain_lookup_tree *first_child;
+ struct domain_lookup_tree *next_sibling;
+};
+
+struct domain_lookup_tree* init_dlt();
+void* find_dlt(struct domain_lookup_tree *head, const char *domain);
+int insert_dlt(struct domain_lookup_tree *head, const char *domain, void *data);
diff --git a/test.c b/test.c
new file mode 100644
index 0000000..a1a1956
--- /dev/null
+++ b/test.c
@@ -0,0 +1,36 @@
+#include "domain-lookup.h"
+#include <stdio.h>
+
+#define print_lookup(head, domain) printf("%s: %s\n", domain, (char *)find_dlt(head, domain))
+
+void main()
+{
+ struct domain_lookup_tree *head = init_dlt();
+ insert_dlt(head, "zx2c4.com", "zx2c4 root node");
+ insert_dlt(head, ".data.zx2c4.com", "zx2c4 data node");
+ insert_dlt(head, "co.uk.", "co.uk root node");
+ insert_dlt(head, "yahoo.co.uk", "yahoo.co.uk node");
+ insert_dlt(head, "yahoo.co.uk", "yahoo.co.uk node overwrite");
+ insert_dlt(head, "something.co.uk", "something town, london");
+ insert_dlt(head, "yonder.co.uk", "yonder town, manchester");
+ insert_dlt(head, "#", "null node");
+
+ print_lookup(head, "blog.zx2c4.com");
+ print_lookup(head, "data.zx2c4.com");
+ print_lookup(head, "dat.zx2c4.com");
+ print_lookup(head, "zx2c4.com");
+ print_lookup(head, "blala.asdf.adsf.adsf.data.zx2c4.com");
+ print_lookup(head, "");
+ print_lookup(head, "yahoo.com.");
+ print_lookup(head, ".news.yahoo.co.uk.");
+ print_lookup(head, "british.co.uk");
+ print_lookup(head, "other.com");
+ print_lookup(head, "test.jj.uk");
+ print_lookup(head, "tasdf.asdf.jj.uk.");
+ print_lookup(head, "yahoo.co.uk.bad");
+ print_lookup(head, "..yahoo.co.uk");
+ print_lookup(head, ".ahaan....data...zx2c4.com");
+ print_lookup(head, "data..zx2c4.com...");
+ print_lookup(head, "asdf.yonder.co.uk");
+ print_lookup(head, "ananana.something.co.uk.");
+}