#!/usr/bin/perl -w

use strict;
use warnings;
use Benchmark;
use Tie::RangeHash;

$| = 1;

print <<'EOF';
#
# rangefind.pl - Benchmark various search methods
#
# Info: http://baetzler.de/perl/rangefind.html
#
# Questions, comments, suggestions? Email thomas@baetzler.de
#
EOF


#
# The Problem:
#
# Find an efficient way to determine if a number x is inside one
# of a number of intervals.
#


#
# The start/end arrays will hold the bounds of the intervals,
# while @tests will hold the numbers we test with. They must be
# declared as "our" since they must be visible inside an eval
# statement (i.e. in the benchmark).
#

our @start;
our @end;
our @tests;
our %rhash;

# set $mode below to 'verify' if you want to verify the results
# that the various search methods return.
my $mode = 'benchmark';

#
# The most straightforward approach is to use a linear interval
# search, where each interval is tried in turn until either a
# match is found or the list of intervals is exhausted.
#
# The advantage of this method is that it is easy to code and
# that the input intervals could be unsorted. The drawback is
# of course that it must check the whole array to verify that
# a test case is outside of all intervals.
#

sub linsearch {
  my $num = shift;

  # interate over all intervals
  for( my $i = 0; $i < @start; $i++ ){
    # return 1 if number is inside the current interval
    if( $start[$i] <= $num && $num <= $end[$i] ){
      return( $i );
    }
  }

  # list exhausted, return -1
  return( -1 );
}



#
# Since we can presort our intervals easily, using a binary
# search sounds like a great option.
#
# Here we use a recursive algorithm that is slightly easier
# to code than the iterative variant. Since we only pass
# numeric values, argument passing during subroutine invocation
# is not that much of an issue.
#
# To recap the algorithm:
#
# find( start_index, end_index )
#   - calculate middle point between start and end
#   - bingo if this is what we were looking for
#   - otherwise repeat search for either upper or lower half
#
# Even though a binary search is a stock tool of the trade for
# computer scientists, we always manage to fit the odd "off by
# one" bug into the partitioning scheme.
#

sub binsearch_recursion {
  my( $lo, $hi, $num ) = @_;

  # no intervals to search, so just report a miss
  return -1 if $lo > $hi;

  # no recursion necessary if we have one interval to check.
  if( $lo == $hi ){
    if( $start[$lo] <= $num && $num <= $end[$lo] ){
      return( $lo );
    } else {
      return( -1 );
    }
  }

  # compute "median" point between start and end indices.
  my $mid = $lo + int(($hi - $lo) / 2);

  # check if the num is in the median interval
  if( $start[$mid] <= $num && $num <= $end[$mid] ){
    # yes, we have a winner!
    return( $mid );
  } else {
    # no, so check wether that number is smaller that the start
    # of our median interval
    if( $start[$mid] > $num ){
      # yes, so we repeat the process in the lower half of the
      # current range.
      return binsearch_recursion( $lo, $mid -1, $num );
    } else {
      # no, so we repeat the process in the upper half of the
      # current range.
      return binsearch_recursion( $mid + 1, $hi, $num );
    }
  }

}

#
# this is just a wrapper sub that gives our binary search the
# same interface as the linear search.
#

sub binsearch {
  my $num = shift;

  return binsearch_recursion( 0, $#end, $num );
}


#
# Another approach would be to make our binary search iterative
# instead of recursive. It may be a little bit harder to code
# this way, but during run-time we save plenty of time by not
# having to call a subroutine.
#

sub binsearch_iterative {
  my $num = shift;

  # bail out if number is beyond the range of our intervals.
  return -1 if $num < $start[0] || $end[-1] < $num;

  # initialize start and end pointers
  my ( $lo, $hi ) = ( 0, $#end );

  while ( $lo <= $hi ) {

    # determine new division point
    my $mid = $lo + int(($hi - $lo) / 2);

    if( $start[$mid] <= $num && $num <= $end[$mid] ){
      # return the index of the interval that contains $num
      return $mid;
    } else {
      # adjust start or end pointer
      if( $num < $start[$mid] ){
        $hi = $mid - 1;
      } else {
        $lo = $mid + 1;
      }
    }
  }

  # we didn't find a suitable interval
  return -1;
}


#
# Here's a different approach to binary searching the intervals
# for the number: We dynmically create a subroutine that is a
# completely unrolled binary search over all arrays using nested
# if statements.
#
# This requires that we first create a text representation of our
# code which is then eval()'ed to create the subroutine. This
# adds some startup overhead, but it is also significantly faster
# than the other methods.
#
# The code is created using a recursive subroutine similar in
# structure to the recursive binary search shown above.
#
# Most of the "complexity" here is added by the design goal to
# create properly indented code that can easily be inspected
# visually.
#
# The major drawback here is that we would have to recreate the
# search tree every time an interval is added or removed.
#

sub create_bst_recursive {
  my( $lo, $hi, $iter ) = @_;

  # we use the $iter depth marker specifically to create a nice
  # indentation using the string repeat operator 'x'.

  # further subdivision is neither necessary nor possible
  return "  "x$iter . "  return -1;\n" if $lo > $hi;

  # no recursion necessary if we have just 1 interval to check.
  if( $lo == $hi ){

    return "  "x$iter .
           "if( $start[$lo] <= \$num && \$num <= $end[$lo] ){\n" .
           "  "x$iter . "  return( $lo );\n" .
           "  "x$iter . "} else {\n" .
           "  "x$iter . "  return( -1 );\n" .
           "  "x$iter . "}\n";
  }

  # compute "median" point between start and end indices.
  my $mid = $lo + int(($hi - $lo) / 2);

  # write code to handle hits, misses and use recursion to
  # fill out "upper" and "lower" branches of the search tree.
  return "  "x$iter .
         "if( $start[$mid] <= \$num\n" . "  "x$iter .
         " "x23 . "&& \$num <= $end[$mid] ){\n" .
         "  "x$iter . "  return( $mid );\n" .
         "  "x$iter . "} else {\n" .
         "  "x$iter . "  if( $start[$mid] > \$num ){\n" .
         create_bst_recursive( $lo, $mid - 1, $iter+2) .
         "  "x$iter . "  } else {\n" .
         create_bst_recursive( $mid + 1, $hi, $iter+2) .
         "  "x$iter . "  }\n" .
         "  "x$iter . "}\n";
}

#
# This subroutine is the stub that contains all of the "unique"
# code parts of the subroutine constructs. It calls the recursive
# sub defined above to fill in the search tree. After that is
# done, it evaluates the code it created to add the subroutine
# dynamically to the program code.
#
# The die() statement at the end of the sub isn't strictly
# necessary but a nice early warning if something is wrong with
# the code we created.
#
# Please note that the code snippet ends with a 1; statement -
# this is since eval() always returns the value of the last
# statement.
#

sub setup_binary_search_tree {

  # Just the wrapper for the search tree:
  my $code = "sub unrolled_binsearch {\n" .
             "  my \$num = shift;\n\n" .
             create_bst_recursive( 0, $#end, 1 ) .
             "\n  die \"Error in unrolled_binsearch():" .
             "unhandled value \$num\"\n}\n\n" .
             "1;\n";

  # print $code for visual inspection
  # uncomment these two lines if you want to see the tree
  #print "\n===[dynamically created code]===\n\n",
  #      "$code================================\n\n";

  # wipe previous function definition so we are not warned
  # about a redefinition of this function
  undef &unrolled_binsearch;

  # eval the code we just created to instantiate our subroutine
  eval( $code ) or die "eval() failed: $@";
}


#
# Test the code by running a few benchmarks
#

foreach my $numintervals ( 1, 2, 5, 10, 20, 50, 100, 200, 500 ){

  my $testcases = 10000;

  # don't forget to clear the intervals at the beginning
  # of each test or we might have a big problem :-)
  @start = ();
  @end = ();

  #
  # Also compare my implementations to an available product
  # i.e. Tie::RangeHash from CPAN.
  #

  # Set up the Tie::RangeHash object
  tie %rhash, 'Tie::RangeHash', {
    Type => Tie::RangeHash::TYPE_NUMBER
  };

  # create evenly spaced intervals 5 numbers in width,
  # so each number ending in 0,1,2,3 or 4 is "in", and
  # each number ending in 5,6,7,8 or 9 is "out".

  for( my( $i, $j ) = ( 0, 4);
                   $i < 10 * $numintervals; $i += 10, $j+= 10 ){
    push @start, $i;
    push @end, $j;
    $rhash{"$i,$j"} = int( $i / 10 );
  }

  # the unrolled binary search needs to be set up
  # for the particulary intervals it's supposed to
  # sort.

  setup_binary_search_tree();

  # create a bunch of test numbers
  for( my $i = 0; $i < $testcases; $i++ ){
    push @tests, int(rand(10 * $numintervals));
  }


  if( $mode eq 'verify' ){

    print "\nVerification for $numintervals intervals\n";

    for( my $i = 0; $i < 10 * $numintervals; $i++ ){
      my $tf = ( $i % 10 ) < 5 ? int( $i / 10 ) : -1;
      die "linsearch() failed verify with $i!\n"
         if $tf != linsearch($i);
      die "binsearch() failed verify with $i!\n"
         if $tf != binsearch($i);
      die "binsearch_iterative() failed verify with $i!\n"
          if $tf != binsearch_iterative($i);
      die "unrolled_binsearch() failed verify with $i!\n"
          if $tf != unrolled_binsearch($i);
      die "Tie::RangeHash failed verify with $i!\n"
          if $tf != ( defined($rhash{$i})  ? $rhash{$i} : -1 );
    }

  } else {

    print "\n$testcases tests with $numintervals intervals\n\n";

    # run a number of benchmarks

    timethese 10, {
      'Linear Search' =>
          'foreach my $test ( @tests ){ linsearch($test); }',
      'Recursive Binary Search' =>
          'foreach my $test ( @tests ){ binsearch($test); }',
      'Iterative Binary Search' =>
          'foreach my $test ( @tests ){ binsearch_iterative($test); }',
      'Unrolled Binary Search' =>
          'foreach my $test ( @tests ){ unrolled_binsearch($test); }',
      'Tie::RangeHash' =>
          'foreach my $test ( @tests ){
             defined( $rhash{$test} ) ? 1 : 0;
            }'
    };
  }
}


