phash.ph 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201
  1. # -*- perl -*-
  2. #
  3. # Perfect Minimal Hash Generator written in Perl, which produces
  4. # C output.
  5. #
  6. require 'random_sv_vectors.ph';
  7. require 'crc64.ph';
  8. #
  9. # Compute the prehash for a key
  10. #
  11. # prehash(key, sv, N)
  12. #
  13. sub prehash($$$) {
  14. my($key, $n, $sv) = @_;
  15. my @c = crc64($sv, $key);
  16. # Create a bipartite graph...
  17. $k1 = (($c[1] & ($n-1)) << 1) + 0; # low word
  18. $k2 = (($c[0] & ($n-1)) << 1) + 1; # high word
  19. return ($k1, $k2);
  20. }
  21. #
  22. # Walk the assignment graph, return true on success
  23. #
  24. sub walk_graph($$$$) {
  25. my($nodeval,$nodeneigh,$n,$v) = @_;
  26. my $nx;
  27. # print STDERR "Vertex $n value $v\n";
  28. $$nodeval[$n] = $v;
  29. foreach $nx (@{$$nodeneigh[$n]}) {
  30. # $nx -> [neigh, hash]
  31. my ($o, $e) = @$nx;
  32. # print STDERR "Edge $n,$o value $e: ";
  33. my $ov;
  34. if (defined($ov = $$nodeval[$o])) {
  35. if ($v+$ov != $e) {
  36. # Cyclic graph with collision
  37. # print STDERR "error, should be ", $v+$ov, "\n";
  38. return 0;
  39. } else {
  40. # print STDERR "ok\n";
  41. }
  42. } else {
  43. return 0 unless (walk_graph($nodeval, $nodeneigh, $o, $e-$v));
  44. }
  45. }
  46. return 1;
  47. }
  48. #
  49. # Generate the function assuming a given N.
  50. #
  51. # gen_hash_n(N, sv, \%data, run)
  52. #
  53. sub gen_hash_n($$$$) {
  54. my($n, $sv, $href, $run) = @_;
  55. my @keys = keys(%{$href});
  56. my $i, $sv;
  57. my $gr;
  58. my $k, $v;
  59. my $gsize = 2*$n;
  60. my @nodeval;
  61. my @nodeneigh;
  62. my %edges;
  63. for ($i = 0; $i < $gsize; $i++) {
  64. $nodeneigh[$i] = [];
  65. }
  66. %edges = ();
  67. foreach $k (@keys) {
  68. my ($pf1, $pf2) = prehash($k, $n, $sv);
  69. ($pf1,$pf2) = ($pf2,$pf1) if ($pf1 > $pf2); # Canonicalize order
  70. my $pf = "$pf1,$pf2";
  71. my $e = ${$href}{$k};
  72. my $xkey;
  73. if (defined($xkey = $edges{$pf})) {
  74. next if ($e == ${$href}{$xkey}); # Duplicate hash, safe to ignore
  75. if (defined($run)) {
  76. print STDERR "$run: Collision: $pf: $k with $xkey\n";
  77. }
  78. return;
  79. }
  80. # print STDERR "Edge $pf value $e from $k\n";
  81. $edges{$pf} = $k;
  82. push(@{$nodeneigh[$pf1]}, [$pf2, $e]);
  83. push(@{$nodeneigh[$pf2]}, [$pf1, $e]);
  84. }
  85. # Now we need to assign values to each vertex, so that for each
  86. # edge, the sum of the values for the two vertices give the value
  87. # for the edge (which is our hash index.) If we find an impossible
  88. # sitation, the graph was cyclic.
  89. @nodeval = (undef) x $gsize;
  90. for ($i = 0; $i < $gsize; $i++) {
  91. if (scalar(@{$nodeneigh[$i]})) {
  92. # This vertex has neighbors (is used)
  93. if (!defined($nodeval[$i])) {
  94. # First vertex in a cluster
  95. unless (walk_graph(\@nodeval, \@nodeneigh, $i, 0)) {
  96. if (defined($run)) {
  97. print STDERR "$run: Graph is cyclic\n";
  98. }
  99. return;
  100. }
  101. }
  102. }
  103. }
  104. # for ($i = 0; $i < $n; $i++) {
  105. # print STDERR "Vertex ", $i, ": ", $g[$i], "\n";
  106. # }
  107. if (defined($run)) {
  108. printf STDERR "$run: Done: n = $n, sv = [0x%08x, 0x%08x]\n",
  109. $$sv[0], $$sv[1];
  110. }
  111. return ($n, $sv, \@nodeval);
  112. }
  113. #
  114. # Driver for generating the function
  115. #
  116. # gen_perfect_hash(\%data)
  117. #
  118. sub gen_perfect_hash($) {
  119. my($href) = @_;
  120. my @keys = keys(%{$href});
  121. my @hashinfo;
  122. my $n, $i, $j, $sv, $maxj;
  123. my $run = 1;
  124. # Minimal power of 2 value for N with enough wiggle room.
  125. # The scaling constant must be larger than 0.5 in order for the
  126. # algorithm to ever terminate.
  127. my $room = int(scalar(@keys)*0.8);
  128. $n = 1;
  129. while ($n < $room) {
  130. $n <<= 1;
  131. }
  132. # Number of times to try...
  133. $maxj = scalar @random_sv_vectors;
  134. for ($i = 0; $i < 4; $i++) {
  135. printf STDERR "%d vectors, trying n = %d...\n",
  136. scalar @keys, $n;
  137. for ($j = 0; $j < $maxj; $j++) {
  138. $sv = $random_sv_vectors[$j];
  139. @hashinfo = gen_hash_n($n, $sv, $href, $run++);
  140. return @hashinfo if (@hashinfo);
  141. }
  142. $n <<= 1;
  143. }
  144. return;
  145. }
  146. #
  147. # Verify that the hash table is actually correct...
  148. #
  149. sub verify_hash_table($$)
  150. {
  151. my ($href, $hashinfo) = @_;
  152. my ($n, $sv, $g) = @{$hashinfo};
  153. my $k;
  154. my $err = 0;
  155. foreach $k (keys(%$href)) {
  156. my ($pf1, $pf2) = prehash($k, $n, $sv);
  157. my $g1 = ${$g}[$pf1];
  158. my $g2 = ${$g}[$pf2];
  159. if ($g1+$g2 != ${$href}{$k}) {
  160. printf STDERR "%s(%d,%d): %d+%d = %d != %d\n",
  161. $k, $pf1, $pf2, $g1, $g2, $g1+$g2, ${$href}{$k};
  162. $err = 1;
  163. } else {
  164. # printf STDERR "%s: %d+%d = %d ok\n",
  165. # $k, $g1, $g2, $g1+$g2;
  166. }
  167. }
  168. die "$0: hash validation error\n" if ($err);
  169. }
  170. 1;