#!/usr/bin/env perl
#
# symtab - generates a symbol table in C
#
# Copyright (c) 2001 by Marc Tardif
# Distributed under the GPL license agreement; see file COPYING
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
# 02111-1307  USA

=head1 NAME

symtab - generates a symbol table in C

=head1 SYNOPSIS

 symtab [ options ] -A list file1 file2 file3
 symtab [ options ] -A trie -o outfile file1
 symtab [ options ] -A hash <infile >outfile

=head1 DESCRIPTION

Symtab reads a table of tab delimited key/data pairs and generates
a symbol table in C.  This provides a function which takes a key
value as argument and returns the corresponding data value.  If
the given key is not in the symbol table, the function returns -1
or NULL if the data is composed of integers or strings respectively.

Three symbol table mechanisms are provided: list, trie and hash.
Each are described at length along with their advantages and
disadvantages in the following Algorithms section.  Note that the
generated data structures are static, meaning it is not possible
to insert and remove entries from the table after it has been
generated.

=cut

require 5.004;

use strict;
use vars qw( $VERSION );

BEGIN {
  $VERSION = "0.1";
}

sub main {
  use IO::File;

  my $options = get_options();
  my $list = "Symtab::$options->{'algorithm'}"->new;

  # input
  if (defined $options->{'input'}) {
    foreach my $input (@{$options->{'input'}}) {
      $options->{'input-fh'} = IO::File->new("< $input");

      unless ($options->{'input-fh'}) {
	warn 'Could not open file "' . $options->{'input-fh'}
	  . '" for reading, skipping.';
	next;
      }

      parse_input($list, $options);
    }
  }
  else {
    $options->{'input-fh'} = \*STDIN;
    parse_input($list, $options);
  }

  # output
  if (defined $options->{'output'}) {
    $options->{'output-fh'} = IO::File->new("> $options->{'output'}")
      or die "Could not open file \"$options->{'output'}\" for writing.";
  }
  else {
    $options->{'output-fh'} = \*STDOUT;
  }

  print {$options->{'output-fh'}} <<EOS;
/* This file was automatically generated by symtab - $VERSION */

EOS
  $list->output($options);
}

sub parse_input {
  my ($list, $options) = @_;

  my $num = 0;
  while (my $line = readline($options->{'input-fh'})) {
    # ignore comments and empty lines
    next if substr($line, 0, 1) eq '#'
      || substr($line, 0, 1) eq "\n";
    chomp($line);

    # make sure both key and data are present
    my ($key, $data) = split(/ *$options->{'field-separator'} */o, $line);
    if (defined $key && defined $data) {
      $list->insert($options->{'ignore-case'} ? "\L$key" : $key, $data);
    }
    elsif (defined $key) {
      warn "Key \"$key\" has no corresponding data, skipping.";
    }
    else {
      warn "No data found on line $num, skipping.";
    }
    $num++;
  }
}

=head1 OPTIONS

=over 4

=item -A, --algorithm name

Required argument defining the name of algorithm: list, trie or hash.

=item -F, --field-separator string

Use string for the input field separator, defaults to tab.

=item -i, --ignore-case

Ignore case distinctions in both the key and the input.

=item -N, --function-name name

Use name for the function which will be called to access the symbol
table, defaults to "get_symbol".

=item -o, --output file

File where to print output, defaults to stdout.

=item -V, --version

Print version information and exit.

=item --help

Print help information and exit.

=back

=cut

sub get_options {
  use Getopt::Long;

  # defaults
  my %options = (
    'field-separator' => "\t",
    'function-name'   => 'get_symbol',
  );

  GetOptions (\%options,
    'algorithm|A=s',
    'field-separator|F=s',
    'function-name|N=s',
    'ignore-case|i',
    'output|o=s',
    'help',
    'version|V',
  ) or exit 1;

  $options{'version'}
    and show_version();

  $options{'help'}
    and show_usage();

  $options{'algorithm'} =~ /^(list|trie|hash)$/i
    ? ($options{'algorithm'} = "\u\L$1")
    : show_usage("option 'algorithm' requires value: list, trie or hash");

  @ARGV
    and $options{'input'} = \@ARGV;

  \%options;
}

sub show_version {
  print "$0 $VERSION\n";
  exit 0;
}

sub show_usage {
  my $msg = shift;

  $msg
  ? print STDERR <<USAGE_SHORT
$msg
Usage: $0 [OPTION]... -A [ALGORITHM] [FILE] ...
Try '$0 --help' for more information.
USAGE_SHORT

  : print <<USAGE_LONG
Usage: $0 [OPTION]... -A [ALGORITHM] [FILE] ...
Generate a symbol table in C from the key/data pairs
in FILE(s) using the specified ALGORITHM.

Required:
  -A, --algorithm name
    Name of the algorithm: list, trie or hash

Optional:
  -F, --field-separator fs
    Use  fs for the input field separator, defaults to tab.
  -i, --ignore-case
    Ignore case distinctions in both the key and the input.
  -N, --function-name name
    Use name for the function which will be called to access
    the symbol table, defaults to "get_symbol".
  -o, --output file
    File where to print output, defaults to stdout.

Miscellaneous:
  -V, --version
    Print version information and exit
  --help
    Display this help and exit
USAGE_LONG
  ;

  exit ! !$msg;
}

# parent package

package Symtab;

use constant UCHAR_MAX  => 0xFF;
use constant USHRT_MAX  => 0xFFFF;

sub new {
  my $class = shift;

  bless {
    max_key_len => 0,
    min_key_len => 0xFFFF,
    max_value => 0,
    min_value => 0xFFFF,
    value_is_int => 1,
  }, ref($class) || $class;
}

sub limits {
  my ($self, $key, $value) = @_;

  my $len = length($key);
  $self->{'max_key_len'} < $len and $self->{'max_key_len'} = $len;
  $self->{'min_key_len'} > $len and $self->{'min_key_len'} = $len;

  if ($self->{'value_is_int'}) {
    if ($value =~ /^-?\d+$/) {
      $self->{'max_value'} < $value and $self->{'max_value'} = $value;
      $self->{'min_value'} > $value and $self->{'min_value'} = $value;
    }
    else {
      $self->{'value_is_int'} = 0;
    }
  }
}

sub get_int_type {
  my ($min, $max) = @_;

  my $type = '';

  # set sign
  $min >= 0
    and $type = 'unsigned ';

  # set size
  if ($max < UCHAR_MAX) {
    $type .= 'char';
  }
  elsif ($max < USHRT_MAX) {
    $type .= 'short';
  }
  else {
    $type .= 'int';
  }
  $type;
}

sub type_and_return {
  my $self = shift;

  return $self->{'value_is_int'}
    ? (get_int_type($self->{'min_value'}, $self->{'max_value'}), '-1')
    : ('char *', 'NULL');
}

=head1 ALGORITHMS

=cut

package Symtab::List;

=head2 List

This mechanism first outputs the table of key/data pairs as a sorted
array.  The ordering is determined by the ASCII value of each key
and sorted in descending order.

The function which finds keys in this array is a binary search.
This consists of repeatedly dividing the search space in half
according to how the requested key value compares with the middle
element.  If it is greater, the search space is reduced to the
seconf half of the list.  If it is lesser, to the first half of
the list.  The process is then repeated for the new search space
until a match is found.

 Pros: requires minimal space
 Cons: max running time of log(n)

=cut

use vars qw( @ISA );

BEGIN {
  @ISA = qw( Symtab );
}

sub insert {
  my ($self, $key, $value) = @_;

  $self->limits($key, $value);
  push @{$self->{'elements'}}, [$key, $value];
}

sub output {
  my ($self, $options) = @_;

  # sort list of elements by key
  my @array = sort { $a->[0] cmp $b->[0] } @{$self->{'elements'}};
  @array = $self->{'value_is_int'}
    ? map { "\"$_->[0]\", $_->[1]" } @array
    : map { "\"$_->[0]\", \"$_->[1]\"" } @array;

  my ($data_type, $return_value) = $self->type_and_return;

  $" = "},\n  {";

  print {$options->{'output-fh'}} <<EOS
#include <string.h>

#define MAX_KEY_LEN $self->{'max_key_len'}
#define MIN_KEY_LEN $self->{'min_key_len'}

struct list {
  char * key;
  $data_type data;
} list[] = {
  {@array},
};

$data_type
$options->{'function-name'} (char *str) {
  register int low, high, mid, cmp;

  low = 0;
  high = sizeof list / sizeof (struct list) - 1;
  while (low <= high) {
    mid = (low + high) / 2;
    cmp = strcasecmp(str, list[mid].key);
    if (cmp > 0)
      high = mid - 1;
    else if (cmp < 0)
      low = mid + 1;
    else
      return list[mid].data;
  }
  return $return_value;
}
EOS
}

package Symtab::Trie;

=head2 Trie

This data structure can be represented as a rooted tree.  Each
child of the root corresponds to the first letter of each word.
If two or more words start with the same letter, the same node is
shared.  From each of these nodes spans the second and subsequent
letters of each word.  In the end, each path from the root maps to
a word.

The code produced is composed of embedded 'switch' and 'if'
statements.  Nodes which have one or two children use the conditional
if statement.  Otherwise, the multi-way 'switch' statement is used.

 Pros: determines if a key isn't part of the list the quickest
 Cons: requires the most space

=cut

use vars qw( @ISA );

BEGIN {
  @ISA = qw( Symtab );
}

sub new {
  my $self = Symtab::new(@_);
  $self->{'root'} = {};
  $self;
}

# iteratively insert key/value pairs in a trie
sub insert {
  my ($self, $key, $value) = @_;

  $self->limits($key, $value);

  # build multi-way decision tree with hashes
  my $tree = $self->{'root'};
  foreach my $c ((map { ord } split('', $key)), 0) {
    unless (exists $tree->{$c}) {
      $tree->{$c} = {};
    }
    $tree = $tree->{$c};
  }
  $tree->{'_value'} = $value;
}

sub _build_trie {
  my ($self, $tree, $depth, $options) = @_;

  my @nodes = keys %{$tree};
  my $indent = '  ' x $depth;

  # declare switch statement if current branch contains
  # multiple immediate children
  my $str = @nodes > 1
    ? "${indent}switch (str[$depth]) {\n"
    : '';

  foreach (@nodes) {
    # return key value if at leaf
    if ($_ eq '_value') {
      $self->{'value_is_int'}
	? ($str .= "${indent}return $tree->{$_};\n")
	: ($str .= "${indent}return \"$tree->{$_}\";\n");
    }
    # otherwise open if or case statement
    else {
      my $c = $_ ? chr : '\\0';
      if (@nodes > 1) {
	$str .= "${indent}case '$c':\n";
	$options->{'ignore-case'} && $c ne "\u$c"
	  and $str .= "${indent}case '\u$c':\n"
      }
      # embed if statements while single branches
      else {
	my $d = $depth;
	$str .= "${indent}if ((str[$d] == '$c'";
	$options->{'ignore-case'} && $c ne "\u$c"
	  and $str .= " || str[$d] == '\u$c'";

	while (ref $tree->{$_} eq 'HASH') {
	  my @nodes = keys %{$tree->{$_}};
	  last if @nodes > 1;
	  last if $nodes[0] eq '_value';
	  $tree = $tree->{$_};
	  $c = ($_ = $nodes[0]) ? chr : '\\0';
	  $str .= ")\n${indent}    && (str[${\++$d}] == '$c'";
	  $options->{'ignore-case'} && $c ne "\u$c"
	    and $str .= " || str[$d] == '\u$c'";
	}
	$str .= "))\n";
      }
    }

    # recurse into trie if node has children
    ref($tree->{$_}) eq 'HASH'
      and $str .= _build_trie($self, $tree->{$_}, $depth + 1, $options);

    # close case statement
    @nodes > 1
      and $str .= "${indent}  break;\n";
  }

  $str .= "${indent}}\n" if (@nodes > 1);

  return $str;
}

sub output {
  my ($self, $options) = @_;

  my $trie = _build_trie($self, $self->{'root'}, 1, $options);
  my ($data_type, $return_value) = $self->type_and_return;

  print {$options->{'output-fh'}} <<EOS;
#define MAX_KEY_LEN $self->{'max_key_len'}
#define MIN_KEY_LEN $self->{'min_key_len'}

$data_type
$options->{'function-name'} (char *str)
{
$trie
  return $return_value;
}
EOS
}

package Symtab::Hash;

=head2 Hash

This algorithms generates a Minimal Perfect Hashing Function (MPHF).
When this is possible, the function hashes m keys to m buckets with
no collisions. Not only is performance optimized, but no space is
wasted in the hash table.

Note that the hash function will always return unsigned values less
than m, even if the given key is not in the symbol table.  In such a
case, it is necessary to compare the key with the original.  If the
given key is assured to be in the symbol table, the comparison step is
no longer necessary and the original keys need not be preserved.

The MPHF algorithm in this implementation was developed by Bob
Jenkins.

 Pros: good time and space ratio
 Cons: not always possible

=cut

require DynaLoader;

use vars qw( @ISA );

BEGIN {
  @ISA = qw( Symtab DynaLoader );
}

bootstrap Symtab::Hash;

use constant USHRT_MAX  => 0xFFFF;
use constant USE_SCRAMBLE => 4096;

require DynaLoader;

sub insert {
  my ($self, $name, $value) = @_;

  $self->limits($name, $value);
  push @{$self->{'keys'}}, { name => $name, value => $value };
}

# set variables once
sub _init {
  my $self = shift;

  my $bits = log2(scalar @{$self->{'keys'}});
  $self->{'smax'} = 1 << $bits;

  # initialize scramble array
  for (my $i = 0; $i < USHRT_MAX; $i++) {
    $self->{'scramble'}[$i] = permute($i, $bits);
  }

  $self->{'alen'} = $self->{'blen'} = $self->{'smax'} / 2;
  $#{$self->{'tabb'}} = $self->{'blen'} - 1;
}

# run a hash function on the keys to get a and b values
sub _init_set {
  my ($self, $salt) = @_;

  my $nocollision = 1;
  my $amask = $self->{'alen'} - 1;
  my $bmask = 32 - log2($self->{'blen'});

  # determine pair (a,b)
  foreach my $key (@{$self->{'keys'}}) {
    my $hash = hash($salt, $key->{'name'});
    $key->{'a'} = ($self->{'alen'} > 1) ? ($hash & $amask) : 0;
    $key->{'b'} = ($self->{'blen'} > 1) ? ($hash >> $bmask) : 0;
  }

  # empty b table
  foreach my $tabb (@{$self->{'tabb'}}) {
    $tabb->{'list'} = [];
  }

  # two keys with the same pair (a,b) guarantee a collision
  foreach my $key (@{$self->{'keys'}}) {
    my $tabb = $self->{'tabb'}[$key->{'b'}];
    foreach my $other (@{$tabb->{'list'}}) {
      if ($key->{'a'} == $other->{'a'}) {
	$nocollision = 0;
	die if $key->{'name'} eq $other->{'name'};
      }
    }
    push @{$tabb->{'list'}}, $key;
  }

  return $nocollision;
}

# try to apply an augmenting list
sub _apply {
  my ($self, $q, $undo) = @_;

  my $tabh = $self->{'tabh'};

  # walk from child to parent
  my $parent;
  for (my $child = $q->[-1]; $child->{'parent'}; $child = $parent) {
    $parent = $child->{'parent'};

    # erase old hash values
    my $scramble = $parent->{'b'}{'val'};
    foreach my $key (@{$parent->{'b'}{'list'}}) {
      my $hash = $scramble ? $key->{'a'} ^ $scramble : $key->{'a'};

      if ($tabh->[$hash] && $key == $tabh->[$hash]) {
	undef $tabh->[$hash];
      }
    }

    # change parent->{'b'}{'val'}, which will change
    # the hashes of all parent siblings
    $parent->{'b'}{'val'} = $undo
      ? $child->{'old_val'}
      : $child->{'new_val'};

    $scramble = $parent->{'b'}{'val'};
    foreach my $key (@{$parent->{'b'}{'list'}}) {
      my $hash = $key->{'a'} ^ $scramble;

      if ($undo) {
	next unless $parent->{'parent'};
      }
      # could not apply augmenting path, undo changes
      elsif ($tabh->[$hash]) {
	_apply($self, $q, 1);
	return 0;
      }
      $tabh->[$hash] = $key;
    }
  }
  return 1;
}

# add item to the mapping
sub _augment {
  my ($self, $item, $highwater) = @_;

  my @q = ({ b => $item });
  my $tabb = $self->{'tabb'};
  my $tabh = $self->{'tabh'};
  my $limit = $self->{'smax'} - 1;

  # construct the spanning tree by walking the queue, add children to tail
  foreach my $q (@q) {
    SCRAMBLE: foreach my $scramble (@{$self->{'scramble'}}[0 .. $limit]) {
      my $b;

      foreach my $key (@{$q->{'b'}{'list'}}) {
	my $hash = $key->{'a'} ^ $scramble;

	# out of bounds
	$hash >= @{$self->{'keys'}}
	  and next SCRAMBLE;

	if (my $childkey = $tabh->[$hash]) {
	  my $hit = $tabb->[$childkey->{'b'}];

	  # hit at most one b
	  if ($b) {
	    $b != $hit
	      and next SCRAMBLE;
	  }
	  else {
	    $b = $hit;
	    $b->{'water'} == $highwater
	      and next SCRAMBLE;
	  }
	}
      }

      # add b to the queue of reachable vertices
      $b and $b->{'water'} = $highwater;
      push @q, {
	b       => $b,
	new_val => $scramble,
	old_val => $q->{'b'}{'val'},
	parent  => $q,
      };

      # apply augmenting path to determine if item has no collisions
      unless ($b) {
	_apply($self, \@q)
	  and return 1;
	pop @q;
      }
    }
  }
  return 0;
}

# find a mapping that makes this a perfect hash
sub _perfect {
  my $self = shift;

  # array of elements, indexed by hash value
  $self->{'tabh'} = [];

  # find the b list with the most keys
  my $maxkeys = 0;
  foreach my $tabb (@{$self->{'tabb'}}) {
    @{$tabb->{'list'}} > $maxkeys
      and $maxkeys = @{$tabb->{'list'}};
  }

  # in descending order by number of keys, map all b's
  foreach my $numkeys (reverse 1 .. $maxkeys) {
    foreach my $index (0 .. $#{$self->{'tabb'}}) {
      my $tabb = $self->{'tabb'}[$index];

      @{$tabb->{'list'}} == $numkeys
	and !_augment($self, $tabb, $index + 1)
	and return 0;
    }
  }

  return 1;
}

# try to find a perfect hash function
sub _find_hash {
  my $self = shift;

  _init($self);

  my $bad_init;
  for (my $salt = 1; $salt; $salt++) {
    # try to find a distinct (a,b) for all keys
    unless (_init_set($self, $salt)) {
      if (++$bad_init >= 2048) {
	if ($self->{'alen'} < $self->{'smax'} / 2) {
	  $self->{'alen'} *= 2;
	}
	elsif ($self->{'blen'} < $self->{'smax'}) {
	  $self->{'blen'} *= 2;
	  $self->{'blen'} > USE_SCRAMBLE
	    and die 'range too big for this implementation';
	  $#{$self->{'tabb'}} = $self->{'blen'} - 1;
	}
	else {
	  return;
	}
	$bad_init = 0;
      }
      next;
    }

    # given a distinct (a,b), attempt to build a perfect hash
    unless (_perfect($self)) {
      if ($self->{'blen'} < $self->{'smax'}) {
	$self->{'blen'} *= 2;
	$self->{'blen'} > USE_SCRAMBLE
	  and die 'range too big for this implementation';
	$#{$self->{'tabb'}} = $self->{'blen'} - 1;
	$salt--;
      }
      else {
	return;
      }
      next;
    }

    return $salt;
  }
}

sub output {
  my ($self, $options) = @_;

  my $salt = _find_hash($self);
  die 'could not find perfect hash' unless $salt;

  # list of elements ordered by hash value
  my @array = map { "\"$_->{'name'}\", $_->{'value'}" } @{$self->{'tabh'}};
  my ($data_type, $return_value) = $self->type_and_return;

  $" = "},\n  {";

  print {$options->{'output-fh'}} <<EOS;
#include <string.h>

#define MAX_KEY_LEN $self->{'max_key_len'}
#define MIN_KEY_LEN $self->{'min_key_len'}

#define GOLDEN_RATIO 0x9E3779B9
#define SALT         $salt

struct list {
  char * key;
  $data_type data;
} list[] = {
  {@array},
};
   
EOS

  my $amask = $self->{'alen'} - 1;
  my $bmask = 32 - log2($self->{'blen'});

  # create adjustment table for a
  my ($max, @line, @table);
  foreach my $tabb (@{$self->{'tabb'}}) {
    push @line, $tabb->{'val'} || 0;
    if ($tabb->{'val'} > $max) {
      $max = $tabb->{'val'};
    }
    if (@line == 16) {
      push @table, join(', ', @line);
      undef @line;
    }
  }
  push(@table, @line) if @line;
  my $tab_type = Symtab::get_int_type(0, $max);

  # ignore case, if applicable
  my $string = $options->{'ignore-case'}
    ? 'tolower(str[i])'
    : 'str[i]';

  $" = ",\n";

  print {$options->{'output-fh'}} <<EOS;
$tab_type tab[] = {
@table
};

$data_type
$options->{'function-name'} (char *str)
{
  register unsigned long hash;
  register int i, len;

  hash = SALT * GOLDEN_RATIO;
  len = strlen (str);
  for (i = 0; i < len; ++i)
    hash = (hash ^ $string) + ((hash<<26)+(hash>>6));
  hash = ((hash & $amask) ^ tab[hash >> $bmask]);

  if (hash < sizeof list / sizeof (struct list)
      && !strncmp (str, list[hash].key, len))
    return list[hash].data;

  return $return_value;
}
EOS
}

package main;

main();

=head1 AUTHOR

Marc Tardif <intmktg@cam.org>

=cut
