# HG changeset patch # User Mahlon E. Smith # Date 1494631061 25200 # Node ID e135ccae6783300d059f5cb62ee6fc615000ea5e # Parent a38e6916504c0dc677b4c8852eda9b597defaf95 Migrate hashing functions to C. diff -r a38e6916504c -r e135ccae6783 .hgignore --- a/.hgignore Fri May 12 11:09:36 2017 -0700 +++ b/.hgignore Fri May 12 16:17:41 2017 -0700 @@ -1,4 +1,4 @@ Session.vim pkg/* docs/* - +tmp/* diff -r a38e6916504c -r e135ccae6783 README.md --- a/README.md Fri May 12 11:09:36 2017 -0700 +++ b/README.md Fri May 12 16:17:41 2017 -0700 @@ -25,7 +25,7 @@ *Strong recommendation*: Create your lists with archiving (-a) and indexing (-i)! This library is suitable for modifying behavior of existing lists as a default, but with these flags enabled, can also -be an interface to parsing and browsing the content of lists. +be a generic interface for parsing and browsing list content. ## Prerequisites @@ -67,6 +67,18 @@ of miscellaneous things included. +## Acknowledgments + +Portions of this library are copied from ezmlm-idx source, authored by +the following: + + * D. J. Bernstein + * Bruce Guenter + +Many thanks for Dan and Bruce for their commitment for fine software, and +contributions to the internet communities over the years. + + ## License Copyright (c) 2017, Mahlon E. Smith diff -r a38e6916504c -r e135ccae6783 Rakefile --- a/Rakefile Fri May 12 11:09:36 2017 -0700 +++ b/Rakefile Fri May 12 16:17:41 2017 -0700 @@ -3,6 +3,12 @@ require 'pathname' +begin + require 'rake/extensiontask' +rescue LoadError + abort "This Rakefile requires rake-compiler (gem install rake-compiler)" +end + PROJECT = 'ezmlm' BASEDIR = Pathname.new( __FILE__ ).expand_path.dirname.relative_path_from( Pathname.getwd ) LIBDIR = BASEDIR + 'lib' @@ -49,6 +55,7 @@ (The -idx provides an extended feature set over the original ezmlm environment.) EOF + s.extensions = FileList[ "ext/**/extconf.rb" ] s.required_ruby_version = '>= 2.1' s.add_dependency 'mail', "~> 2.6" @@ -60,6 +67,9 @@ pkg.need_tar = true end +# Build the C extension for hashing addresses. +Rake::ExtensionTask.new( 'ezmlm/hash', spec ) + ######################################################################## ### D O C U M E N T A T I O N @@ -74,7 +84,7 @@ rdoc.rdoc_dir = 'docs' rdoc.main = "README.rdoc" # rdoc.options = [ '-f', 'fivefish' ] - rdoc.rdoc_files = [ 'lib', *FileList['*.rdoc'] ] + rdoc.rdoc_files = [ 'lib', *FileList['ext/*/*.c'], *FileList['*.rdoc'] ] end RDoc::Task.new do |rdoc| @@ -116,6 +126,12 @@ ### M A N I F E S T ######################################################################## __END__ +ext/ezmlm/hash/hash.c +ext/ezmlm/hash/hash.h +ext/ezmlm/hash/extconf.rb lib/ezmlm/list.rb +lib/ezmlm/list/message.rb +lib/ezmlm/list/thread.rb +lib/ezmlm/list/author.rb lib/ezmlm.rb diff -r a38e6916504c -r e135ccae6783 experiments/surf.pdf Binary file experiments/surf.pdf has changed diff -r a38e6916504c -r e135ccae6783 ext/ezmlm/hash/extconf.rb --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ext/ezmlm/hash/extconf.rb Fri May 12 16:17:41 2017 -0700 @@ -0,0 +1,5 @@ + +require 'mkmf' + +create_makefile( 'ezmlm/hash' ) + diff -r a38e6916504c -r e135ccae6783 ext/ezmlm/hash/hash.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ext/ezmlm/hash/hash.c Fri May 12 16:17:41 2017 -0700 @@ -0,0 +1,193 @@ + +#include "hash.h" + +/* + * I originally attemped to just convert surf.c to pure Ruby, but I + * confess a lack of understanding surrounding the char casts from + * unsigned ints, etc, and screwing up a hash algo doesn't do anyone + * any good, least of all, me. In other words, I don't have to fully + * understand DJB code to trust in it. :-) + * + * The following is copied verbatim from the ezmlm-idx source, version + * 7.2.2. See: subhash.c, surf.c, and surfpcs.c. + * +*/ + +void surf(unsigned int out[8],const unsigned int in[12],const unsigned int seed[32]) +{ + unsigned int t[12]; unsigned int x; unsigned int sum = 0; + int r; int i; int loop; + + for (i = 0;i < 12;++i) t[i] = in[i] ^ seed[12 + i]; + for (i = 0;i < 8;++i) out[i] = seed[24 + i]; + x = t[11]; + for (loop = 0;loop < 2;++loop) { + for (r = 0;r < 16;++r) { + sum += 0x9e3779b9; + MUSH(0,5) MUSH(1,7) MUSH(2,9) MUSH(3,13) + MUSH(4,5) MUSH(5,7) MUSH(6,9) MUSH(7,13) + MUSH(8,5) MUSH(9,7) MUSH(10,9) MUSH(11,13) + } + for (i = 0;i < 8;++i) out[i] ^= t[i + 4]; + } +} + +void surfpcs_init(surfpcs *s,const unsigned int k[32]) +{ + int i; + for (i = 0;i < 32;++i) s->seed[i] = k[i]; + for (i = 0;i < 8;++i) s->sum[i] = 0; + for (i = 0;i < 12;++i) s->in[i] = 0; + s->todo = 0; +} + +void surfpcs_add(surfpcs *s,const char *x,unsigned int n) +{ + int i; + while (n--) { + data[end[s->todo++]] = *x++; + if (s->todo == 32) { + s->todo = 0; + if (!++s->in[8]) + if (!++s->in[9]) + if (!++s->in[10]) + ++s->in[11]; + surf(s->out,s->in,s->seed); + for (i = 0;i < 8;++i) + s->sum[i] += s->out[i]; + } + } +} + +void surfpcs_addlc(surfpcs *s,const char *x,unsigned int n) +/* modified from surfpcs_add by case-independence and skipping ' ' & '\t' */ +{ + unsigned char ch; + int i; + while (n--) { + ch = *x++; + if (ch == ' ' || ch == '\t') continue; + if (ch >= 'A' && ch <= 'Z') + ch -= 'a' - 'A'; + + data[end[s->todo++]] = ch; + if (s->todo == 32) { + s->todo = 0; + if (!++s->in[8]) + if (!++s->in[9]) + if (!++s->in[10]) + ++s->in[11]; + surf(s->out,s->in,s->seed); + for (i = 0;i < 8;++i) + s->sum[i] += s->out[i]; + } + } +} + +void surfpcs_out(surfpcs *s,unsigned char h[32]) +{ + int i; + surfpcs_add(s,".",1); + while (s->todo) surfpcs_add(s,"",1); + for (i = 0;i < 8;++i) s->in[i] = s->sum[i]; + for (;i < 12;++i) s->in[i] = 0; + surf(s->out,s->in,s->seed); + for (i = 0;i < 32;++i) h[i] = outdata[end[i]]; +} + +void makehash(const char *indata,unsigned int inlen,char *hash) + /* makes hash[COOKIE=20] from stralloc *indata, ignoring case and */ + /* SPACE/TAB */ +{ + unsigned char h[32]; + surfpcs s; + unsigned int seed[32]; + int i; + + for (i = 0;i < 32;++i) seed[i] = 0; + surfpcs_init(&s,seed); + surfpcs_addlc(&s,indata,inlen); + surfpcs_out(&s,h); + for (i = 0;i < 20;++i) + hash[i] = 'a' + (h[i] & 15); +} + +unsigned int subhashb(const char *s,long len) +{ + unsigned long h; + h = 5381; + while (len-- > 0) + h = (h + (h << 5)) ^ (unsigned int)*s++; + return h % 53; +} + +unsigned int subhashs(const char *s) +{ + return subhashb(s,strlen(s)); +} + +/* end copy of ezmlm-idx source */ + + + + +/* + * call­seq: + * Ezmlm::Hash.address( email ) -> String + * + * Call the Surf hashing function on an +email+ address, returning + * the hashed string. This is specific to how ezmlm is seeding + * the hash, and parsing email addresses from messages (prefixed with + * the '<' character.) + * + */ +VALUE +address( VALUE klass, VALUE email ) { + char hash[20]; + char *input; + + Check_Type( email, T_STRING ); + + email = rb_str_plus( rb_str_new2("<"), email); + input = StringValueCStr( email ); + + makehash( input, strlen(input), hash ); + + return rb_str_new2( hash ); +} + + +/* + * call­seq: + * Ezmlm::Hash.subscriber( address ) -> String + * + * Call the subscriber hashing function on an email +address+, returning + * the index character referring to the file containing subscriber presence. + * + */ +VALUE +subscriber( VALUE klass, VALUE email ) { + unsigned int prefix; + + Check_Type( email, T_STRING ); + + email = rb_str_plus( rb_str_new2("T"), email); + prefix = subhashs( StringValueCStr(email) ) + 64; + + return rb_sprintf( "%c", (char)prefix ); +} + + + +void +Init_hash() +{ + rb_mEzmlm = rb_define_module( "Ezmlm" ); + rb_cEZHash = rb_define_class_under( rb_mEzmlm, "Hash", rb_cObject ); + + rb_define_module_function( rb_cEZHash, "address", address, 1 ); + rb_define_module_function( rb_cEZHash, "subscriber", subscriber, 1 ); + + return; +} + diff -r a38e6916504c -r e135ccae6783 ext/ezmlm/hash/hash.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ext/ezmlm/hash/hash.h Fri May 12 16:17:41 2017 -0700 @@ -0,0 +1,46 @@ + +#include + +static VALUE rb_mEzmlm; +static VALUE rb_cEZHash; + + +#ifndef SURF_H +#define SURF_H + +#define ROTATE(x,b) (((x) << (b)) | ((x) >> (32 - (b)))) +#define MUSH(i,b) x = t[i] += (((x ^ seed[i]) + sum) ^ ROTATE(x,b)); + +typedef struct { + unsigned int seed[32]; + unsigned int sum[8]; + unsigned int out[8]; + unsigned int in[12]; + int todo; +} surfpcs; + +static const unsigned int littleendian[8] = { + 0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c, + 0x13121110, 0x17161514, 0x1b1a1918, 0x1f1e1d1c +} ; +#define end ((unsigned char *) &littleendian) +#define data ((unsigned char *) s->in) +#define outdata ((unsigned char *) s->out) + +extern void surf( unsigned int out[8], const unsigned int in[12], const unsigned int seed[32] ); +extern void surfpcs_init( surfpcs *s, const unsigned int k[32] ); +extern void surfpcs_add( surfpcs *s, const char *x,unsigned int n ); +extern void surfpcs_addlc( surfpcs *s, const char *x,unsigned int n ); +extern void surfpcs_out( surfpcs *s, unsigned char h[32] ); +#endif + + +#ifndef SUBHASH_H +#define SUBHASH_H + +unsigned int subhashs(const char *s); +unsigned int subhashb(const char *s,long len); +#define subhashsa(SA) subhashb((SA)->s,(SA)->len) + +#endif + diff -r a38e6916504c -r e135ccae6783 lib/ezmlm.rb --- a/lib/ezmlm.rb Fri May 12 11:09:36 2017 -0700 +++ b/lib/ezmlm.rb Fri May 12 16:17:41 2017 -0700 @@ -23,6 +23,7 @@ # Suck in the components. # + require 'ezmlm/hash' require 'ezmlm/list' require 'ezmlm/list/author' require 'ezmlm/list/message' diff -r a38e6916504c -r e135ccae6783 lib/ezmlm/list.rb --- a/lib/ezmlm/list.rb Fri May 12 11:09:36 2017 -0700 +++ b/lib/ezmlm/list.rb Fri May 12 16:17:41 2017 -0700 @@ -19,10 +19,6 @@ ### class Ezmlm::List - # Quick address space detection, to (hopefully) - # match the overflow size on this machine. - ADDRESS_SPACE = [ 'i' ].pack( 'p' ).size * 8 - # Valid subdirectories/sections for subscriptions. SUBSCRIPTION_DIRS = %w[ deny mod digest allow ] @@ -75,7 +71,7 @@ ### def include?( addr, section: nil ) addr.downcase! - file = self.subscription_dir( section ) + self.hashchar( addr ) + file = self.subscription_dir( section ) + Ezmlm::Hash.subscriber( addr ) return false unless file.exist? return file.read.scan( /T([^\0]+)\0/ ).flatten.include?( addr ) end @@ -97,7 +93,7 @@ next unless address.index( '@' ) address.downcase! - file = self.subscription_dir( section ) + self.hashchar( address ) + file = self.subscription_dir( section ) + Ezmlm::Hash.subscriber( address ) self.with_safety do if file.exist? addresses = file.read.scan( /T([^\0]+)\0/ ).flatten @@ -123,7 +119,7 @@ addr.each do |address| address.downcase! - file = self.subscription_dir( section ) + self.hashchar( address ) + file = self.subscription_dir( section ) + Ezmlm::Hash.subscriber( address ) self.with_safety do next unless file.exist? addresses = file.read.scan( /T([^\0]+)\0/ ).flatten @@ -680,10 +676,12 @@ end - ### Return an Author object for the given +author_id+. + ### Return an Author object for the given +author_id+, which + ### could also be an email address. ### def author( author_id ) raise "Archiving is not enabled." unless self.archived? + author_id = Ezmlm::Hash.address(author_id) if author_id.index( '@' ) return Ezmlm::List::Author.new( self, author_id ) end @@ -729,34 +727,6 @@ protected ######### - ### Hash an email address, using the ezmlm algorithm for - ### fast user lookups. Returns the hashed integer. - ### - ### Older ezmlm didn't lowercase addresses, anything within the last - ### decade did. We're not going to worry about compatibility there. - ### - ### See: subhash.c in the ezmlm source. - ### - def subhash( addr ) - h = 5381 - over = 2 ** ADDRESS_SPACE - - addr = 'T' + addr.downcase - addr.each_char do |c| - h = ( h + ( h << 5 ) ) ^ c.ord - h = h % over if h > over # emulate integer overflow - end - return h % 53 - end - - - ### Given an email address, return the ascii hash prefix. - ### - def hashchar( addr ) - return ( self.subhash(addr) + 64 ).chr - end - - ### Just return the contents of the provided +file+, rooted ### in the list directory. ### diff -r a38e6916504c -r e135ccae6783 spec/ezmlm/list_spec.rb --- a/spec/ezmlm/list_spec.rb Fri May 12 11:09:36 2017 -0700 +++ b/spec/ezmlm/list_spec.rb Fri May 12 16:17:41 2017 -0700 @@ -341,6 +341,11 @@ expect( list.author('ojjhjlapnejjlbcplabi') ).to be_a( Ezmlm::List::Author ) end + it 'fetches author objects by email address' do + author = list.author( 'ojjhjlapnejjlbcplabi' ) + expect( list.author('yvette@example.net').name ).to eq( author.name ) + end + context 'fetching messages' do it 'raises an error if archiving is disabled' do