Migrate hashing functions to C.
authorMahlon E. Smith <mahlon@laika.com>
Fri, 12 May 2017 16:17:41 -0700
changeset 16 e135ccae6783
parent 15 a38e6916504c
child 17 23c7f5c8ee39
Migrate hashing functions to C.
.hgignore
README.md
Rakefile
experiments/surf.pdf
ext/ezmlm/hash/extconf.rb
ext/ezmlm/hash/hash.c
ext/ezmlm/hash/hash.h
lib/ezmlm.rb
lib/ezmlm/list.rb
spec/ezmlm/list_spec.rb
--- a/.hgignore	Fri May 12 11:09:36 2017 -0700
+++ b/.hgignore	Fri May 12 16:17:41 2017 -0700
@@ -1,4 +1,4 @@
 Session.vim
 pkg/*
 docs/*
-
+tmp/*
--- a/README.md	Fri May 12 11:09:36 2017 -0700
+++ b/README.md	Fri May 12 16:17:41 2017 -0700
@@ -25,7 +25,7 @@
 *Strong recommendation*: Create your lists with archiving (-a) and
 indexing (-i)!   This library is suitable for modifying behavior of
 existing lists as a default, but with these flags enabled, can also
-be an interface to parsing and browsing the content of lists.
+be a generic interface for parsing and browsing list content.
 
 
 ## Prerequisites
@@ -67,6 +67,18 @@
 of miscellaneous things included.
 
 
+## Acknowledgments
+
+Portions of this library are copied from ezmlm-idx source, authored by
+the following:
+
+ * D. J. Bernstein <djb@cr.yp.to>
+ * Bruce Guenter <bruce@untroubled.org>
+
+Many thanks for Dan and Bruce for their commitment for fine software, and
+contributions to the internet communities over the years.
+
+
 ## License
 
 Copyright (c) 2017, Mahlon E. Smith <mahlon@martini.nu>
--- a/Rakefile	Fri May 12 11:09:36 2017 -0700
+++ b/Rakefile	Fri May 12 16:17:41 2017 -0700
@@ -3,6 +3,12 @@
 
 require 'pathname'
 
+begin
+	require 'rake/extensiontask'
+rescue LoadError
+	abort "This Rakefile requires rake-compiler (gem install rake-compiler)"
+end
+
 PROJECT = 'ezmlm'
 BASEDIR = Pathname.new( __FILE__ ).expand_path.dirname.relative_path_from( Pathname.getwd )
 LIBDIR  = BASEDIR + 'lib'
@@ -49,6 +55,7 @@
 (The -idx provides an extended feature set over the original ezmlm
 environment.)
 	EOF
+	s.extensions = FileList[ "ext/**/extconf.rb" ]
 	s.required_ruby_version = '>= 2.1'
 
 	s.add_dependency 'mail', "~> 2.6"
@@ -60,6 +67,9 @@
 	pkg.need_tar = true
 end
 
+# Build the C extension for hashing addresses.
+Rake::ExtensionTask.new( 'ezmlm/hash', spec )
+
 
 ########################################################################
 ### D O C U M E N T A T I O N
@@ -74,7 +84,7 @@
 		rdoc.rdoc_dir   = 'docs'
 		rdoc.main       = "README.rdoc"
 		# rdoc.options    = [ '-f', 'fivefish' ]
-		rdoc.rdoc_files = [ 'lib', *FileList['*.rdoc'] ]
+		rdoc.rdoc_files = [ 'lib', *FileList['ext/*/*.c'], *FileList['*.rdoc'] ]
 	end
 
 	RDoc::Task.new do |rdoc|
@@ -116,6 +126,12 @@
 ### M A N I F E S T
 ########################################################################
 __END__
+ext/ezmlm/hash/hash.c
+ext/ezmlm/hash/hash.h
+ext/ezmlm/hash/extconf.rb
 lib/ezmlm/list.rb
+lib/ezmlm/list/message.rb
+lib/ezmlm/list/thread.rb
+lib/ezmlm/list/author.rb
 lib/ezmlm.rb
 
Binary file experiments/surf.pdf has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ext/ezmlm/hash/extconf.rb	Fri May 12 16:17:41 2017 -0700
@@ -0,0 +1,5 @@
+
+require 'mkmf'
+
+create_makefile( 'ezmlm/hash' )
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ext/ezmlm/hash/hash.c	Fri May 12 16:17:41 2017 -0700
@@ -0,0 +1,193 @@
+
+#include "hash.h"
+
+/*
+ * I originally attemped to just convert surf.c to pure Ruby, but I
+ * confess a lack of understanding surrounding the char casts from
+ * unsigned ints, etc, and screwing up a hash algo doesn't do anyone
+ * any good, least of all, me.  In other words, I don't have to fully
+ * understand DJB code to trust in it. :-)
+ *
+ * The following is copied verbatim from the ezmlm-idx source, version
+ * 7.2.2.  See: subhash.c, surf.c, and surfpcs.c.
+ *
+*/
+
+void surf(unsigned int out[8],const unsigned int in[12],const unsigned int seed[32])
+{
+  unsigned int t[12]; unsigned int x; unsigned int sum = 0;
+  int r; int i; int loop;
+
+  for (i = 0;i < 12;++i) t[i] = in[i] ^ seed[12 + i];
+  for (i = 0;i < 8;++i) out[i] = seed[24 + i];
+  x = t[11];
+  for (loop = 0;loop < 2;++loop) {
+    for (r = 0;r < 16;++r) {
+      sum += 0x9e3779b9;
+      MUSH(0,5) MUSH(1,7) MUSH(2,9) MUSH(3,13)
+      MUSH(4,5) MUSH(5,7) MUSH(6,9) MUSH(7,13)
+      MUSH(8,5) MUSH(9,7) MUSH(10,9) MUSH(11,13)
+    }
+    for (i = 0;i < 8;++i) out[i] ^= t[i + 4];
+  }
+}
+
+void surfpcs_init(surfpcs *s,const unsigned int k[32])
+{
+  int i;
+  for (i = 0;i < 32;++i) s->seed[i] = k[i];
+  for (i = 0;i < 8;++i) s->sum[i] = 0;
+  for (i = 0;i < 12;++i) s->in[i] = 0;
+  s->todo = 0;
+}
+
+void surfpcs_add(surfpcs *s,const char *x,unsigned int n)
+{
+  int i;
+  while (n--) {
+    data[end[s->todo++]] = *x++;
+    if (s->todo == 32) {
+      s->todo = 0;
+      if (!++s->in[8])
+        if (!++s->in[9])
+          if (!++s->in[10])
+            ++s->in[11];
+      surf(s->out,s->in,s->seed);
+      for (i = 0;i < 8;++i)
+	s->sum[i] += s->out[i];
+    }
+  }
+}
+
+void surfpcs_addlc(surfpcs *s,const char *x,unsigned int n)
+/* modified from surfpcs_add by case-independence and skipping ' ' & '\t' */
+{
+  unsigned char ch;
+  int i;
+  while (n--) {
+    ch = *x++;
+    if (ch == ' ' || ch == '\t') continue;
+    if (ch >= 'A' && ch <= 'Z')
+      ch -= 'a' - 'A';
+
+    data[end[s->todo++]] = ch;
+    if (s->todo == 32) {
+      s->todo = 0;
+      if (!++s->in[8])
+        if (!++s->in[9])
+          if (!++s->in[10])
+            ++s->in[11];
+      surf(s->out,s->in,s->seed);
+      for (i = 0;i < 8;++i)
+	  s->sum[i] += s->out[i];
+    }
+  }
+}
+
+void surfpcs_out(surfpcs *s,unsigned char h[32])
+{
+  int i;
+  surfpcs_add(s,".",1);
+  while (s->todo) surfpcs_add(s,"",1);
+  for (i = 0;i < 8;++i) s->in[i] = s->sum[i];
+  for (;i < 12;++i) s->in[i] = 0;
+  surf(s->out,s->in,s->seed);
+  for (i = 0;i < 32;++i) h[i] = outdata[end[i]];
+}
+
+void makehash(const char *indata,unsigned int inlen,char *hash)
+	/* makes hash[COOKIE=20] from stralloc *indata, ignoring case and */
+	/* SPACE/TAB */
+{
+  unsigned char h[32];
+  surfpcs s;
+  unsigned int seed[32];
+  int i;
+
+  for (i = 0;i < 32;++i) seed[i] = 0;
+  surfpcs_init(&s,seed);
+  surfpcs_addlc(&s,indata,inlen);
+  surfpcs_out(&s,h);
+  for (i = 0;i < 20;++i)
+    hash[i] = 'a' + (h[i] & 15);
+}
+
+unsigned int subhashb(const char *s,long len)
+{
+  unsigned long h;
+  h = 5381;
+  while (len-- > 0)
+    h = (h + (h << 5)) ^ (unsigned int)*s++;
+  return h % 53;
+}
+
+unsigned int subhashs(const char *s)
+{
+  return subhashb(s,strlen(s));
+}
+
+/* end copy of ezmlm-idx source */
+
+
+
+
+/*
+ * call­seq:
+ *   Ezmlm::Hash.address( email ) -> String
+ *
+ * Call the Surf hashing function on an +email+ address, returning
+ * the hashed string.  This is specific to how ezmlm is seeding
+ * the hash, and parsing email addresses from messages (prefixed with
+ * the '<' character.)
+ *
+ */
+VALUE
+address( VALUE klass, VALUE email ) {
+	char hash[20];
+	char *input;
+
+	Check_Type( email, T_STRING );
+
+	email = rb_str_plus( rb_str_new2("<"), email);
+	input = StringValueCStr( email );
+
+	makehash( input, strlen(input), hash );
+
+	return rb_str_new2( hash );
+}
+
+
+/*
+ * call­seq:
+ *   Ezmlm::Hash.subscriber( address ) -> String
+ *
+ * Call the subscriber hashing function on an email +address+, returning
+ * the index character referring to the file containing subscriber presence.
+ *
+ */
+VALUE
+subscriber( VALUE klass, VALUE email ) {
+	unsigned int prefix;
+
+	Check_Type( email, T_STRING );
+
+	email  = rb_str_plus( rb_str_new2("T"), email);
+	prefix = subhashs( StringValueCStr(email) ) + 64;
+
+	return rb_sprintf( "%c", (char)prefix );
+}
+
+
+
+void
+Init_hash()
+{
+	rb_mEzmlm  = rb_define_module( "Ezmlm" );
+	rb_cEZHash = rb_define_class_under( rb_mEzmlm, "Hash", rb_cObject );
+
+	rb_define_module_function( rb_cEZHash, "address", address, 1 );
+	rb_define_module_function( rb_cEZHash, "subscriber", subscriber, 1 );
+
+	return;
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ext/ezmlm/hash/hash.h	Fri May 12 16:17:41 2017 -0700
@@ -0,0 +1,46 @@
+
+#include <ruby.h>
+
+static VALUE rb_mEzmlm;
+static VALUE rb_cEZHash;
+
+
+#ifndef SURF_H
+#define SURF_H
+
+#define ROTATE(x,b) (((x) << (b)) | ((x) >> (32 - (b))))
+#define MUSH(i,b) x = t[i] += (((x ^ seed[i]) + sum) ^ ROTATE(x,b));
+
+typedef struct {
+	unsigned int seed[32];
+	unsigned int sum[8];
+	unsigned int out[8];
+	unsigned int in[12];
+	int todo;
+} surfpcs;
+
+static const unsigned int littleendian[8] = {
+  0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c,
+  0x13121110, 0x17161514, 0x1b1a1918, 0x1f1e1d1c
+} ;
+#define end ((unsigned char *) &littleendian)
+#define data ((unsigned char *) s->in)
+#define outdata ((unsigned char *) s->out)
+
+extern void surf( unsigned int out[8], const unsigned int in[12], const unsigned int seed[32] );
+extern void surfpcs_init( surfpcs *s, const unsigned int k[32] );
+extern void surfpcs_add( surfpcs *s, const char *x,unsigned int n );
+extern void surfpcs_addlc( surfpcs *s, const char *x,unsigned int n );
+extern void surfpcs_out( surfpcs *s, unsigned char h[32] );
+#endif
+
+
+#ifndef SUBHASH_H
+#define SUBHASH_H
+
+unsigned int subhashs(const char *s);
+unsigned int subhashb(const char *s,long len);
+#define subhashsa(SA) subhashb((SA)->s,(SA)->len)
+
+#endif
+
--- a/lib/ezmlm.rb	Fri May 12 11:09:36 2017 -0700
+++ b/lib/ezmlm.rb	Fri May 12 16:17:41 2017 -0700
@@ -23,6 +23,7 @@
 
 	# Suck in the components.
 	#
+	require 'ezmlm/hash'
 	require 'ezmlm/list'
 	require 'ezmlm/list/author'
 	require 'ezmlm/list/message'
--- a/lib/ezmlm/list.rb	Fri May 12 11:09:36 2017 -0700
+++ b/lib/ezmlm/list.rb	Fri May 12 16:17:41 2017 -0700
@@ -19,10 +19,6 @@
 ###
 class Ezmlm::List
 
-	# Quick address space detection, to (hopefully)
-	# match the overflow size on this machine.
-	ADDRESS_SPACE = [ 'i' ].pack( 'p' ).size * 8
-
 	# Valid subdirectories/sections for subscriptions.
 	SUBSCRIPTION_DIRS = %w[ deny mod digest allow ]
 
@@ -75,7 +71,7 @@
 	###
 	def include?( addr, section: nil )
 		addr.downcase!
-		file = self.subscription_dir( section ) + self.hashchar( addr )
+		file = self.subscription_dir( section ) + Ezmlm::Hash.subscriber( addr )
 		return false unless file.exist?
 		return file.read.scan( /T([^\0]+)\0/ ).flatten.include?( addr )
 	end
@@ -97,7 +93,7 @@
 			next unless address.index( '@' )
 			address.downcase!
 
-			file = self.subscription_dir( section ) + self.hashchar( address )
+			file = self.subscription_dir( section ) + Ezmlm::Hash.subscriber( address )
 			self.with_safety do
 				if file.exist?
 					addresses = file.read.scan( /T([^\0]+)\0/ ).flatten
@@ -123,7 +119,7 @@
 		addr.each do |address|
 			address.downcase!
 
-			file = self.subscription_dir( section ) + self.hashchar( address )
+			file = self.subscription_dir( section ) + Ezmlm::Hash.subscriber( address )
 			self.with_safety do
 				next unless file.exist?
 				addresses = file.read.scan( /T([^\0]+)\0/ ).flatten
@@ -680,10 +676,12 @@
 	end
 
 
-	### Return an Author object for the given +author_id+.
+	### Return an Author object for the given +author_id+, which
+	### could also be an email address.
 	###
 	def author( author_id )
 		raise "Archiving is not enabled." unless self.archived?
+		author_id = Ezmlm::Hash.address(author_id) if author_id.index( '@' )
 		return Ezmlm::List::Author.new( self, author_id )
 	end
 
@@ -729,34 +727,6 @@
 	protected
 	#########
 
-	### Hash an email address, using the ezmlm algorithm for
-	### fast user lookups.  Returns the hashed integer.
-	###
-	### Older ezmlm didn't lowercase addresses, anything within the last
-	### decade did.  We're not going to worry about compatibility there.
-	###
-	### See: subhash.c in the ezmlm source.
-	###
-	def subhash( addr )
-		h = 5381
-		over = 2 ** ADDRESS_SPACE
-
-		addr = 'T' + addr.downcase
-		addr.each_char do |c|
-			h = ( h + ( h << 5 ) ) ^ c.ord
-			h = h % over if h > over # emulate integer overflow
-		end
-		return h % 53
-	end
-
-
-	### Given an email address, return the ascii hash prefix.
-	###
-	def hashchar( addr )
-		return ( self.subhash(addr) + 64 ).chr
-	end
-
-
 	### Just return the contents of the provided +file+, rooted
 	### in the list directory.
 	###
--- a/spec/ezmlm/list_spec.rb	Fri May 12 11:09:36 2017 -0700
+++ b/spec/ezmlm/list_spec.rb	Fri May 12 16:17:41 2017 -0700
@@ -341,6 +341,11 @@
 		expect( list.author('ojjhjlapnejjlbcplabi') ).to be_a( Ezmlm::List::Author )
 	end
 
+	it 'fetches author objects by email address' do
+		author = list.author( 'ojjhjlapnejjlbcplabi' )
+		expect( list.author('yvette@example.net').name ).to eq( author.name )
+	end
+
 
 	context 'fetching messages' do
 		it 'raises an error if archiving is disabled' do