diff options
-rw-r--r-- | Makefile | 9 | ||||
-rw-r--r-- | README.md | 57 | ||||
-rw-r--r-- | benchmark.c | 155 | ||||
-rw-r--r-- | domain-lookup.c | 158 | ||||
-rw-r--r-- | domain-lookup.h | 10 | ||||
-rw-r--r-- | test.c | 36 |
6 files changed, 425 insertions, 0 deletions
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..f83110e --- /dev/null +++ b/Makefile @@ -0,0 +1,9 @@ +CFLAGS = -march=native -pipe -fomit-frame-pointer -flto -O3 + +all: test benchmark +test: domain-lookup.c domain-lookup.h test.c +benchmark: domain-lookup.c domain-lookup.h benchmark.c +clean: + rm -f test benchmark + +.PHONY: clean diff --git a/README.md b/README.md new file mode 100644 index 0000000..a606fc4 --- /dev/null +++ b/README.md @@ -0,0 +1,57 @@ +# Domain Lookup Tree +### by Jason A. Donenfeld (<Jason@zx2c4.com>) + +Some applications, like [dnsmasq](http://www.thekelleys.org.uk/dnsmasq/doc.html), need to find the closest-match domain name. One way is by iterating through a list of domains and finding the one that has the longest length match. But a more efficient way is to build a tree of domain name components. + +This project provides a very simple C API for a domain name matching tree. The **[source code is simple, extremely well commented, and easy to read](http://git.zx2c4.com/domain-lookup-tree/tree/domain-lookup.c)**. + +## API + +#### `struct domain_lookup_tree* init_dlt()` + +Constructs a new tree head and returns a pointer to it. + +#### `void* find_dlt(struct domain_lookup_tree *head, const char *domain)` + +Returns the prior inserted data in `head` for a given `domain`. + +#### `int insert_dlt(struct domain_lookup_tree *head, const char *domain, void *data)` + +Inserts `data` into a tree given by `head` for a given `domain`. If `domain` has length of zero or is equal to `#`, it will act as the default match for non-matching lookups. + + +## Example + + struct domain_lookup_tree *domains = init_dlt(); + + insert_dlt(domains, "zx2c4.com", "zx2c4 root node"); + insert_dlt(domains, "data.zx2c4.com", "zx2c4 data node"); + insert_dlt(domains, "yahoo.co.uk", "yahoo.co.uk node"); + insert_dlt(domains, "#", "generic node"); + + char *data = find_dlt(domains, "cheese.somewhere.zx2c4.com"); + printf("zx2c4 root node == %s\n", data); + +## Benchmarks + +With gcc 4.7.2 on an Intel Core i7-3820QM, we show a **~587x speed improvement** over dnsmasq's old algorithm: + + zx2c4@thinkpad ~/Projects/domain-lookup-tree $ make + cc -march=native -pipe -fomit-frame-pointer -flto -O3 benchmark.c domain-lookup.c domain-lookup.h -o benchmark + + zx2c4@thinkpad ~/Projects/domain-lookup-tree $ ./benchmark + [+] Populating in-memory word list from /usr/share/dict/words. + [+] Truncating in-memory word list to 300 random words. + [+] Creating random lists of 100000 domains to insert and 100000 domains to query. + [+] Populating domain lookup tree. + [+] Performing lookup benchmarks: + [*] New method took 0.34 seconds. + [*] Old method took 199.45 seconds. + [*] The new method is 586.617647 times faster than the old method. + [+] Verifying that new and old methods produced identical results: + [*] New and old methods produced the same results. + +## Limitations & Improvements + +* Currently no helper functions for removing and freeing nodes +* Instead of `strcmp`'ing, a hash function could be used diff --git a/benchmark.c b/benchmark.c new file mode 100644 index 0000000..1eb71fe --- /dev/null +++ b/benchmark.c @@ -0,0 +1,155 @@ +#include "domain-lookup.h" +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <math.h> +#include <time.h> + +static unsigned long range_rand(unsigned long min, unsigned long max) +{ + unsigned long base_random, range, remainder, bucket; + + base_random = rand(); + if (RAND_MAX == base_random) + return range_rand(min, max); + range = max - min; + remainder = RAND_MAX % range; + bucket = RAND_MAX / range; + if (base_random < RAND_MAX - remainder) + return min + base_random / bucket; + return range_rand(min, max); +} + + +static char* make_domain(char **dictionary, size_t size, size_t max_components) +{ + char *domain; + int i; + + domain = malloc(512); + domain[0] = 0; + + /* strcat can overflow. I don't care. */ + for (i = 0; i < range_rand(2, max_components + 1); ++i) { + if (i > 0) + strcat(domain, "."); + strcat(domain, dictionary[range_rand(0, size)]); + } + + return domain; +} + + +static void* old_find_dlt(char *domain, char **domains, size_t domains_size) +{ + unsigned int namelen, matchlen, domainlen, i; + char *matchstart, *out; + + namelen = strlen(domain); + matchlen = 0; + out = 0; + for (i = 0; i < domains_size; ++i) { + domainlen = strlen(domains[i]); + matchstart = domain + namelen - domainlen; + if (namelen >= domainlen && + (domainlen == 0 || namelen == domainlen || *(matchstart - 1) == '.' ) && + domainlen >= matchlen && !strcmp(matchstart, domains[i])) { + matchlen = domainlen; + out = domains[i]; + } + } + return out; +} + + +#define DICTIONARY_INITIAL_SIZE 250000 +#define DICTIONARY_TRUNCATED_SIZE 300 +#define INSERTED_DOMAINS_SIZE 100000 +#define QUERIED_DOMAINS_SIZE 100000 +#define INSERTED_DOMAINS_COMPONENTS 3 +#define QUERIED_DOMAINS_COMPONENTS 5 +#define DICTIONARY_PATH "/usr/share/dict/words" + +int main() +{ + FILE *dictionary_file; + char **dictionary, **inserted_domains, **queried_domains; + void **verification_new, **verification_old; + char *word; + size_t dictionary_size, word_size, i; + struct domain_lookup_tree *head; + clock_t start, old_time, new_time; + + srand(time(NULL) + 42); + + printf("[+] Populating in-memory word list from " DICTIONARY_PATH ".\n"); + + dictionary = malloc(DICTIONARY_INITIAL_SIZE * sizeof(char *)); + dictionary_file = fopen(DICTIONARY_PATH, "r"); + if (!dictionary_file || !dictionary) + return 1; + for (dictionary_size = 0; dictionary_size < DICTIONARY_INITIAL_SIZE; ++dictionary_size) { + word_size = 256; + word = malloc(word_size); + if (!word) + return 1; + if (getline(&word, &word_size, dictionary_file) < 0) + break; + word_size = strlen(word); + if (word[word_size - 1] == '\n') + word[word_size - 1] = 0; + dictionary[dictionary_size] = word; + } + fclose(dictionary_file); + + printf("[+] Truncating in-memory word list to %d random words.\n", DICTIONARY_TRUNCATED_SIZE); + + for (i = 0; i < DICTIONARY_TRUNCATED_SIZE; ++i) + dictionary[i] = dictionary[range_rand(0, dictionary_size)]; + + printf("[+] Creating random lists of %d domains to insert and %d domains to query.\n", INSERTED_DOMAINS_SIZE, QUERIED_DOMAINS_SIZE); + + inserted_domains = malloc(INSERTED_DOMAINS_SIZE * sizeof(char *)); + queried_domains = malloc(QUERIED_DOMAINS_SIZE * sizeof(char *)); + verification_new = malloc(QUERIED_DOMAINS_SIZE * sizeof(char *)); + verification_old = malloc(QUERIED_DOMAINS_SIZE * sizeof(char *)); + if (!inserted_domains || !queried_domains || !verification_new || !verification_old) + return 1; + + for (i = 0; i < QUERIED_DOMAINS_SIZE; ++i) + queried_domains[i] = make_domain(dictionary, DICTIONARY_TRUNCATED_SIZE, QUERIED_DOMAINS_COMPONENTS); + for (i = 0; i < INSERTED_DOMAINS_SIZE; ++i) + inserted_domains[i] = make_domain(dictionary, DICTIONARY_TRUNCATED_SIZE, INSERTED_DOMAINS_COMPONENTS); + + printf("[+] Populating domain lookup tree.\n"); + + head = init_dlt(); + for (i = 0; i < INSERTED_DOMAINS_SIZE; ++i) + insert_dlt(head, inserted_domains[i], inserted_domains[i]); + + printf("[+] Performing lookup benchmarks:\n"); + + start = clock(); + for (i = 0; i < QUERIED_DOMAINS_SIZE; ++i) + verification_new[i] = find_dlt(head, queried_domains[i]); + new_time = clock() - start; + printf(" [*] New method took %.2f seconds.\n", (double)new_time / CLOCKS_PER_SEC); + + start = clock(); + for (i = 0; i < QUERIED_DOMAINS_SIZE; ++i) + verification_old[i] = old_find_dlt(queried_domains[i], inserted_domains, INSERTED_DOMAINS_SIZE); + old_time = clock() - start; + printf(" [*] Old method took %.2f seconds.\n", (double)old_time / CLOCKS_PER_SEC); + + printf(" [*] The new method is %f times faster than the old method.\n", (double)old_time / (double)new_time); + + printf("[+] Verifying that new and old methods produced identical results:\n"); + for (i = 0; i < QUERIED_DOMAINS_SIZE; ++i) { + if (verification_old[i] != verification_new[i]) { + printf(" [!] New and old method produced different results!\n"); + return 1; + } + } + printf(" [*] New and old methods produced the same results.\n"); + return 0; +} diff --git a/domain-lookup.c b/domain-lookup.c new file mode 100644 index 0000000..2adfe8d --- /dev/null +++ b/domain-lookup.c @@ -0,0 +1,158 @@ +/* domain-lookup.c is Copyright (c) 2013 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 dated June, 1991, or + (at your option) version 3 dated 29 June, 2007. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + + + this.is.how.domains.are.stored.com + seeing.how.domains.are.stored.com + is.very.interesting.com + + [head] + | + com + /\ + stored interesting + / \ + are very + / \ + domains is + / + how + /\ + is seeing + / + this + +*/ + + +#include "domain-lookup.h" +#include <string.h> +#include <stdlib.h> + +/* Single internal function for walking, defined bellow. */ +static struct domain_lookup_tree* walk_dlt(struct domain_lookup_tree *head, const char *domain_immutable, int with_data); + +/* The following three functions comprise the entire API. */ +struct domain_lookup_tree* init_dlt() +{ + struct domain_lookup_tree *head = malloc(sizeof(struct domain_lookup_tree)); + if (!head) + return 0; + memset(head, 0, sizeof(struct domain_lookup_tree)); + return head; +} +void* find_dlt(struct domain_lookup_tree *head, const char *domain) +{ + /* If head is non-null, walk_dlt will always return non-null. */ + return walk_dlt(head, domain, 1)->data; +} +int insert_dlt(struct domain_lookup_tree *head, const char *domain, void *data) +{ + head = walk_dlt(head, domain, 0); + if (!head) /* This should only fail when malloc fails. */ + return -1; + head->data = data; + return 0; +} + +/* Where the actual meat happens. This walks down the tree starting at head. + * If with_data is true, it returns the closest matching domain's data that's + * been explicitly inserted. Otherwise, it returns an existing or newly-created + * node for the provided domain. */ +static struct domain_lookup_tree* walk_dlt(struct domain_lookup_tree *head, const char *domain, int with_data) +{ + struct domain_lookup_tree *child, *data_head; + const char *component, *right_dot; + int i, len, component_len; + + /* We immediately return the root node for null, "", and "#". */ + if (!domain) + return head; + len = strlen(domain); + if (!len) + return head; + if (!strcmp(domain, "#")) + return head; + + /* data_head keeps track of the closest match that has been + * explicitly inserted -- which amounts to a node having a non-null + * data member. */ + data_head = head; + + /* The right-most dot starts as the null byte at the end of the string. */ + right_dot = &domain[len]; + + /* Iterate backward through the domain. */ + for (i = len - 1; i >= 0; --i) { + /* We're only interested when we reach a dot or the first character. */ + if (domain[i] != '.' && i) + continue; + + /* If we're at a dot, then our component is everything to the right of the dot. + * Note that this does not overflow, because right_dot is initialized to be + * domain[len], which is the max value of i + 1. */ + if (domain[i] == '.') + component = &domain[i + 1]; + else /* Otherwise, we're at the first character of the domain, and it's not a dot, */ + component = &domain[i]; /* so our component just begins at this index. */ + + /* right_dot - component is the length of the component, up to the previous right dot. */ + component_len = (int)(right_dot - component); + + /* We adjust right_dot to be this latest dot we've found. If domain[i] isn't a dot, + i == 0, so this will be the last iteration anyway. */ + right_dot = &domain[i]; + + /* If the component has no length, we skip it. This automatically elides adjacent dots. */ + if (!component_len) + continue; + + /* We iterate through all the children of the current head looking for a match. */ + for (child = head->first_child; child; child = child->next_sibling) { + /* We want to compare only component_len bytes and also check that they have the same len. */ + if (!strncmp(component, child->component, component_len) && strlen(child->component) == component_len) + break; + } + + if (child) { /* child is non-null in the case where a match is found. */ + /* We update the current head and make sure to update data_head + * if we found a child that was explicitly added. */ + head = child; + if (head->data) + data_head = head; + } else { /* child is null in the case where no matching component was found. */ + /* If we're just searching for the closest explicitly added + * match, return it immediately. */ + if (with_data) + return data_head; + /* Otherwise, create a new child node and insert it. */ + child = init_dlt(); + if (!child) /* init_dlt calls malloc, which can fail. */ + return 0; + /* The same length calculation from before. */ + child->component = strndup(component, component_len); + if (!child->component) /* strdup calls malloc, which can fail. */ + return 0; + /* Insert the new child at the front of the child list for the current head. */ + child->next_sibling = head->first_child; + head->first_child = child; + /* Update the head to the new child. */ + head = child; + } + } + /* If we made it here, we've either matched for every component, or created [a] new child[ren]. + * In either case, we return either data_head or head, depending on which was requested. */ + if (with_data) + return data_head; + return head; +} diff --git a/domain-lookup.h b/domain-lookup.h new file mode 100644 index 0000000..8e45dee --- /dev/null +++ b/domain-lookup.h @@ -0,0 +1,10 @@ +struct domain_lookup_tree { + char *component; + void *data; + struct domain_lookup_tree *first_child; + struct domain_lookup_tree *next_sibling; +}; + +struct domain_lookup_tree* init_dlt(); +void* find_dlt(struct domain_lookup_tree *head, const char *domain); +int insert_dlt(struct domain_lookup_tree *head, const char *domain, void *data); @@ -0,0 +1,36 @@ +#include "domain-lookup.h" +#include <stdio.h> + +#define print_lookup(head, domain) printf("%s: %s\n", domain, (char *)find_dlt(head, domain)) + +void main() +{ + struct domain_lookup_tree *head = init_dlt(); + insert_dlt(head, "zx2c4.com", "zx2c4 root node"); + insert_dlt(head, ".data.zx2c4.com", "zx2c4 data node"); + insert_dlt(head, "co.uk.", "co.uk root node"); + insert_dlt(head, "yahoo.co.uk", "yahoo.co.uk node"); + insert_dlt(head, "yahoo.co.uk", "yahoo.co.uk node overwrite"); + insert_dlt(head, "something.co.uk", "something town, london"); + insert_dlt(head, "yonder.co.uk", "yonder town, manchester"); + insert_dlt(head, "#", "null node"); + + print_lookup(head, "blog.zx2c4.com"); + print_lookup(head, "data.zx2c4.com"); + print_lookup(head, "dat.zx2c4.com"); + print_lookup(head, "zx2c4.com"); + print_lookup(head, "blala.asdf.adsf.adsf.data.zx2c4.com"); + print_lookup(head, ""); + print_lookup(head, "yahoo.com."); + print_lookup(head, ".news.yahoo.co.uk."); + print_lookup(head, "british.co.uk"); + print_lookup(head, "other.com"); + print_lookup(head, "test.jj.uk"); + print_lookup(head, "tasdf.asdf.jj.uk."); + print_lookup(head, "yahoo.co.uk.bad"); + print_lookup(head, "..yahoo.co.uk"); + print_lookup(head, ".ahaan....data...zx2c4.com"); + print_lookup(head, "data..zx2c4.com..."); + print_lookup(head, "asdf.yonder.co.uk"); + print_lookup(head, "ananana.something.co.uk."); +} |