#!/usr/bin/perl

########################################################################
# vanityhash, a hex hash fragment creation tool
# Copyright (C) 2010 Ryan Finnie <ryan@finnie.org>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.
########################################################################

my $VERSION = '1.1';

use strict;
use warnings;
use Digest;
use Getopt::Long;
use Pod::Usage;
use Time::HiRes qw/time/;
use Socket;
use IO::Handle;
use IO::Select;
use POSIX ":sys_wait_h";

my $opt_find;
my $opt_bits = 24;
my $opt_findpos = 0;
my $opt_anypos;
my $opt_progressint = 5;
my $opt_workers = 1;
my $opt_digesttype = "md5";
my $opt_append;
my $opt_quiet;
my $opt_help;

Getopt::Long::Configure("bundling");
my($result) = GetOptions(
  'bits|b=i' => \$opt_bits,
  'position|p=i' => \$opt_findpos,
  'any-position|y' => \$opt_anypos,
  'progress|s=f' => \$opt_progressint,
  'workers|w=i' => \$opt_workers,
  'digest|d=s' => \$opt_digesttype,
  'append|a' => \$opt_append,
  'quiet|q' => \$opt_quiet,
  'help|?' => \$opt_help,
);

if((scalar @ARGV == 0) || $opt_help) {
  print STDERR "vanityhash version $VERSION\n";
  print STDERR "Copyright (C) 2010 Ryan Finnie <ryan\@finnie.org>\n";
  print STDERR "\n";
  pod2usage(2);
}

$opt_find = $ARGV[0];
unless($opt_find =~ /^[0-9A-Fa-f]+$/) {
  die("Pattern must be specified as hex digits.\n");
}

if(($opt_bits < 1) || ($opt_bits > 64)) {
  die("Search space must be between 1 and 64 bits, inclusive.\n");
}

my $packtype;
if($opt_bits > 32) {
  $packtype = 'Q';
  eval {
    my $testpack = pack($packtype, 0);
  };
  die("Search spaces greater than 32 bits are not supported on this platform.\n") if $@;
} elsif($opt_bits > 16) {
  $packtype = 'L';
  eval {
    my $testpack = pack($packtype, 0);
  };
  # Running this in real mode DOS, perhaps?
  die("Search spaces greater than 16 bits are not supported on this platform.\n") if $@;
} elsif($opt_bits > 8) {
  $packtype = 'S';
  eval {
    my $testpack = pack($packtype, 0);
  };
  # Apple II?
  die("Search spaces greater than 8 bits are not supported on this platform.\n") if $@;
} else {
  $packtype = 'C';
}

# This is an arbitrary restriction to protect users from themselves and 
# exhaust resources.  If you happen to have a system with more than 64 
# hardware threads, manually disable this check.  Also, could I borrow 
# the system?
if(($opt_workers < 1) || ($opt_workers > 64)) {
  die("Workers must be between 1 and 64, inclusive.\n");
}

if($opt_findpos < 0) {
  die("Pattern position must be 0 or greater.\n");
}

# All digest operations are lowercase.
$opt_find = lc($opt_find);

# Allowed: md2 md4 md5 sha1alt sha* crc*
my(@digestarray, $digestdisp);
if($opt_digesttype eq "md5") {
  $digestdisp = "MD5";
  @digestarray = ("MD5");
} elsif($opt_digesttype eq "md4") {
  $digestdisp = "MD4";
  @digestarray = ("MD4");
} elsif($opt_digesttype eq "md2") {
  $digestdisp = "MD2";
  @digestarray = ("MD2");
} elsif($opt_digesttype eq "sha1alt") {
  $digestdisp = "SHA1";
  @digestarray = ("SHA1");
} elsif($opt_digesttype =~ /^sha(\d+)$/) {
  $digestdisp = "SHA$1";
  @digestarray = ("SHA", $1);
} elsif($opt_digesttype =~ /^crc(.*?)$/) {
  $digestdisp = uc("CRC$1");
  @digestarray = ("CRC", (type => lc("crc$1")));
} else {
  die(sprintf("Unknown digest type: %s\n", $opt_digesttype));
}

my $findlen = length($opt_find);
my $every = 100000 * $opt_workers;

# Create the initial context, and populate with the input.  Note: 
# calculating the hash is a destructive act, so any hash calculations 
# must be done against a clone of this digest.
my $ctx = Digest->new(@digestarray) || die "Cannot create digest context: $!";
print STDERR "Reading input data and adding to digest..." unless $opt_quiet;
my $datalen = 0;
binmode(STDIN);
while (<STDIN>) {
  print $_ if($opt_append);
  $datalen += length($_);
  $ctx->add($_);
}
print STDERR "done.\n" unless $opt_quiet;

my $origdigest = $ctx->clone->hexdigest;
printf STDERR "Original data: %d bytes, %s %s\n", $datalen, $digestdisp, $origdigest unless $opt_quiet;

# Figure out the maximum (character) length of the original hash, and 
# make sure the user-supplied options aren't overrunning this.
if($opt_findpos > (length($origdigest) - $findlen)) {
  die(sprintf("Pattern position %d goes beyond end of %s digest, maximum supported is %d.\n", $opt_findpos, $digestdisp, (length($origdigest) - $findlen)));
}

if($opt_anypos) {
  printf STDERR "Searching for %s at any position in a %d-bit space.\n", $opt_find, $opt_bits unless $opt_quiet;
} else {
  printf STDERR "Searching for %s at position %d in a %d-bit space.\n", $opt_find, $opt_findpos, $opt_bits unless $opt_quiet;
}

my(%children) = ();
my(@childsocks) = ();
my $start = time;
my($iosel) = IO::Select->new();

# These need to be global because of the child INT handler.
# They are effectively per-child.
my($childi, $childt);
my($parentsock, $childsock);

printf STDERR "Spawning %d worker%s... ", $opt_workers, ($opt_workers == 1 ? "" : "s") unless $opt_quiet;
for($childt = 0; $childt < $opt_workers; $childt++) {
  $childsock = undef;
  $parentsock = undef;
  # Create a socket pair for parent/child communications.
  socketpair($childsock, $parentsock, AF_UNIX, SOCK_STREAM, PF_UNSPEC) or die "socketpair: $!";
  $childsock->autoflush(1);
  $parentsock->autoflush(1);
  push(@childsocks, $childsock);

  my($pid);
  if(!defined($pid = fork())) {
    # Something bad happened.
    die "Cannot fork worker $childt: $!";
  } elsif($pid == 0) {
    # THIS BLOCK RUNS THE FORKED CHILD PROCESS
    # Set up a INT handler for premature termination.
    $SIG{INT} = \&childsigint;

    # Since multiple children are forked, @childsocks is filled up with 
    # sockets of all previous child sockets.  Since we don't need any 
    # of them (just a socket to the parent), close them all.
    foreach my $csock (@childsocks) {
      close($csock);
    }
    @childsocks = ();

    my $nextprogress = $childt + $every;
    for($childi = $childt; $childi < (2**$opt_bits-1); $childi += $opt_workers) {
      if($childi >= $nextprogress) {
        $nextprogress = $childi + $every;
        printf $parentsock "%d\t%d\tPROGRESS\t%d\n", $childt, $$, (($childi - $childt) / $opt_workers)+1;
      }

      # Add the test data and determine the hash.
      my $testctx = $ctx->clone;
      $testctx->add(pack($packtype, $childi));
      my $testdigest = $testctx->hexdigest;
      if((substr($testdigest, $opt_findpos, $findlen) eq $opt_find) || ($opt_anypos && (index($testdigest, $opt_find) > -1))) {
        printf $parentsock "%d\t%d\tFOUND\t%s\t%d\n", $childt, $$, $testdigest, $childi;
      }
    }
    printf $parentsock "%d\t%d\tPROGRESS\t%d\n", $childt, $$, (((2**$opt_bits-1) - $childt) / $opt_workers)+1;
    exit(0);

  } else {
    # THIS BLOCK IS A CONTINUATION OF THE MASTER PROCESS
    # The parent doesn't need the parent portion of the socket.
    close($parentsock);
    # Add to IO::Select object for monitoring responses from children.
    $iosel->add($childsock);
    $children{$pid} = $childt;
  }
}
print STDERR "done.\n" unless $opt_quiet;

$SIG{INT} = \&parentsigint;

my $printedappend = 0;
my $matchesfound = 0;
my $nextprogress = 0;
if($opt_progressint > 0) {
  $nextprogress = time + $opt_progressint;
}

# Total number of processed hashes from all children.
my %totalis = ();

while((scalar keys %children) > 0) {
  # REAPER CODE

  # Constantly loop until one of our children dies.
  my($pid) = 0;
  $pid = waitpid(-1, WNOHANG);
  if($pid > 0) {
    my($exitstatus) = $? >> 8;
    my $t = $children{$pid};
    delete($children{$pid});
    unless($exitstatus == 0) {
      die(sprintf("Worker %d died with exit status %d.\n", $t, $exitstatus));
    }
  }


  my(@canread) = $iosel->can_read(($opt_progressint > 0) ? ($nextprogress - time) : 1);
  foreach my $sock (@canread) {
    # Read the message from the child.
    my $in = <$sock>;
    next unless $in;
    chomp $in;
    #printf STDERR "Received: %s\n", $in unless $opt_quiet;
    my($msgt, $msgpid, $msgcmd, $msgrest) = split(/\t/, $in, 4);
    if($msgcmd eq "PROGRESS") {
      # Progress indicator
      my $processed = $msgrest;
      $totalis{$msgt} = $processed;
    } elsif($msgcmd eq "FOUND") {
      # Match found
      my($msgdigest, $msgdata) = split(/\t/, $msgrest, 2);
      $msgdata = pack($packtype, $msgdata);
      printf STDERR "Match found: 0x%*v02x -> %s %s\n", '', $msgdata, $digestdisp, $msgdigest unless $opt_quiet;
      $matchesfound++;
      if($opt_append) {
        # It's possible children may send back positive matches by the time
        # we're done killing them, but we only want to output a positive
        # match once while in append mode.
        if(!$printedappend) {
          # Output the binary data, appending to the end of the original.
          print $msgdata;
          $printedappend = 1;
          # We won't need the workers anymore.
          kill(POSIX::SIGINT(), keys %children);
        }
      } else {
        # Print a machine readable match line.
        printf "%*v02x %s\n", '', $msgdata, $msgdigest;
      }
    }
  }

  # Occasionally print a human-readable status line.
  my $now = time;
  if(($opt_progressint > 0) && ($now >= $nextprogress)) {
    my $totali = 0;
    for(my $i = 0; $i < $opt_workers; $i++) {
      $totali += $totalis{$i} if($totalis{$i});
    }
    my $elapsed = $now - $start;
    my $percent = ($totali / (2**$opt_bits-1)) * 100;
    my $remaining = 0;
    if($totali > 0) {
      $remaining = ((2**$opt_bits-1) - $totali) / ($totali / $elapsed);
      printf STDERR "%3d%% searched, ~%02d:%02d remaining...\n", $percent, ($remaining / 60), ($remaining % 60) unless $opt_quiet;
    } else {
      printf STDERR "%3d%% searched...\n", $percent unless $opt_quiet;
    }
    $nextprogress = $now + $opt_progressint;
  }

}

# We're done with a full space search!
my $end = time;
my $totaldelta = $end - $start;

# Calculate final totals from the children.
my $totali = 0;
for(my $i = 0; $i < $opt_workers; $i++) {
  $totali += $totalis{$i} if($totalis{$i});
}

printf STDERR "Search finished in %02d:%02d, %d match%s found in %d%% of a %d-bit space.\n",
  ($totaldelta / 60),
  ($totaldelta % 60),
  $matchesfound,
  ($matchesfound == 1 ? "" : "es"),
  ($totali / (2**$opt_bits-1) * 100),
  $opt_bits
unless $opt_quiet;

exit(0);

########################################
# Subs
########################################

sub parentsigint {
  print STDERR "User interrupt, cleaning up.\n" unless $opt_quiet;
  kill(POSIX::SIGINT(), keys %children);
}

sub childsigint {
  printf $parentsock "%d\t%d\tPROGRESS\t%d\n", $childt, $$, (($childi - $childt) / $opt_workers)+1;
  exit(0);
}

########################################
# Manpage
########################################

=head1 NAME

vanityhash - A hex hash fragment creation tool

=head1 SYNOPSIS

B<vanityhash> S<[ B<options> ]> hexfragment < inputfile

B<vanityhash> B<--append> S<[ B<options> ]> hexfragment < inputfile > outputfile

=head1 DESCRIPTION

B<vanityhash> is a tool that can discover data to be added to the end 
of a file to produce a desired hex hash fragment.  It searches a 
message space and runs a hashing algorithm against the original data 
plus the appended data to determine if the desired hash fragment is 
present.  vanityhash can run multiple parallel workers to effectively 
make use of multiple processors/cores/threads, and supports multiple 
hash digest types (MD5, SHA1, SHA256, etc).

vanityhash can be used to append data to files that are capable of 
ignoring garbage data at the end of the file (such as ISO images and 
some types of graphic images), in order to produce a "vanity" hash.  
vanityhash is fast, as it only reads the base input data once, and then 
reverts back to that base state over and over while it permeates the 
search space, rather than hashing the entire source during each 
permeation.

vanityhash operates on the concept of a "search space".  For example, 
given a 24-bit search space, vanityhash will run from 0x00000000 to 
0x00ffffff, append the 4-byte packed version of each number to the end 
of the input data, calculate the resulting hash, and search the hash 
value for the desired hex fragment pattern.  A desired hex fragment can 
be larger than the search space (for example, searching for "abcdef" in 
a 16-bit search space), but the chances of finding a match reduce 
drastically the larger the desired hex fragment is.

In its default operation, vanityhash will search the entire specified 
search space and output all matching results to STDOUT, one result per 
line, in the form "extradata hash", where both "extradata" and "hash" 
are in hex form.  When the B<--append> option is specified, this 
behavior changes.  If a match is found, the original input data plus 
the extra data (in byte form) are outputted, and searching ends after 
the first successful match.  If no matches are found, the original data 
only is outputted.

=head1 OPTIONS

=over

=item B<-b> I<bits>, B<--bits>=I<bits>

Space to be searched, in bits.  Allowed values range from 1 to 64.  
Default is 24.  Search spaces larger than 32 bits require a 64-bit 
operating system, and a version of Perl compiled with 64-bit integer 
support.

=item B<-p> I<position>, B<--position>=I<position>

The position within the hex hash to look for the desired fragment, in 
hex digits.  The beginning starts at 0.  Default is 0.  A position that 
extends the fragment beyond the length of the hash is not allowed.

=item B<-y>, B<--any-position>

When enabled, this option will override B<--position> and will return 
hashes that contain the desired fragment in any position within the 
hash.

=item B<-s> I<seconds>, B<--progress>=I<seconds>

The number of seconds between printing of progress lines, default 5 
seconds.  A decimal value may be specified.  A value of 0 disabled 
printing progress lines.

=item B<-w> I<workers>, B<--workers>=I<workers>

The number of workers to be spawned.  Default is 1.  Recommended value 
is the number of logical processors on the running system.

=item B<-d> I<digesttype>, B<--digest>=I<digesttype>

The hashing digest type to use.  Default is "md5" Allowed values are 
"md2", "md4", "md5", and "shaI<N>" where I<N> is a valid SHA digest 
value.  "sha1alt" is accepted to use Digest::SHA1 instead of 
Digest::SHA.  Note that for many digest types, the appropriate Perl 
module must be installed and available.

=item B<-a>, B<--append>

When enabled, the original data is outputted back to STDOUT.  Then, 
when/if the first matching hash is found, the data fragment used to 
produce the matching hash is outputted to STDOUT.  STDOUT can then be 
redirected to another file to produce the modified file.

=item B<-q>, B<--quiet>

Normally vanityhash sends a fair amount of status information to STDERR 
during operation.  When enabled, all non-error status information is 
instead suppressed.

=item B<-?>, B<--help>

Print a synposis and exit.

=back

=head1 BUGS / LIMITATIONS

Search spaces larger than 32 bits require a 64-bit operating system, 
and a version of Perl compiled with 64-bit integer support.

A block of computed data is added equal to the size of the integer type 
the search space fits into (1 byte for 8 bits or less, 2 bytes for 9 
through 16 bits, 4 bytes for 17 through 32 bits, 8 bytes for 33 through 
64 bits), even if the search space could fit into a smaller raw byte 
block (say, 3 bytes for a 20-bit search space).  While this does not 
reduce (or increase) the possibility of finding a match in a given 
search space, the extra null byte(s) in the block are technically 
wasteful.

Extra bytes are packed according to system endianness.  Thus, search 
results will be different between big and little endian systems.

vanityhash should work fine on any POSIX operating system, and has been 
tested on Linux and Mac OS X.  It mostly works with Strawberry Perl for 
Windows, but crashes at the end.  Suggestions to fix this would be 
welcomed.

=head1 CREDITS

B<vanityhash> was written by Ryan Finnie <ryan@finnie.org>.  vanityhash 
was inspired by Seth David Schoen's 2003 program, hash_search.
