
# Time-stamp: "2004-12-29 22:45:42 AST" -*-Perl-*-
### The POD is at the end. ###
##########################################################
#
# Lifted from Games::Dissociate.pm.
#
##########################################################

require 5.000;
package JFH::Dissociate;
use strict;
require Exporter;
use Data::Random qw(rand_set);
use Text::Wrap;

use vars qw(@ISA @EXPORT @EXPORT_OK $Debug $VERSION);
use Carp;
@ISA = qw(Exporter);
@EXPORT = qw(dissociate dissociate_glob);
$VERSION = 0.15;
$Debug = 0;

###########################################################################

sub dissociate_glob{
  my $glob=shift;
  my $degree=shift;
  my $maxpara=shift;

  Text::Wrap::wrap('','',dissociate(read_glob_into_string($glob),$degree,$maxpara));
}

sub read_glob_into_string{
  my $glob=shift;
  my @candidates=glob($glob);
  my $min=60;
  my $max=120;
  my $s;

  my @files = rand_set(set => \@candidates,
		       min=> $min < scalar @candidates? $min: scalar @candidates,
		       max=> $max < scalar @candidates? $max: scalar @candidates);


  foreach (@files) {
    next unless `file "$_" |grep text`;

    $s .= JFH::Dissociate::read_message_into_string($_);
  }
  $s;
}

sub read_message_into_string {
  my $f=shift;
  open(FILE, "<$f") or die "Oops!  Couldn't open $f.";

  my $str=join('',<FILE>);

  #remove headers from beginning of file.  Note: Headers can be
  #multiple line, but if so, subsequent lines start with spaces.
  $str=~s/^From .*\n//gs;
  $str=~s/^[[:space:]]*([A-Z][[:^space:]]*:[^\n]*\n([ \t]+[[:^space:]][^\n]*\n)*)*//gs;

  # Remove quoted words, including those that are wrapped to the next
  # line, but shouldn't be.
  $str=~s/^>[^\n]*\n(?:[^>][^\n]*\n>[^\n]*\n)*//gsmx;

  #Remove boxquotes
  $str=~s/^,----.*?`----//gsm;
  
  # Remove attributions
  $str=~s/.*(wrote|writes)(:| in message.*)//g;

  #Remove <deleted> tags
  $str =~ s/<deleted>[[:space:]]*//g;

  #Skip signature stuff
  $str=~s/^(James Harris[[:space:]]*|URL:.*)$//gsm;
  $str=~s/^--( |=20)\n.*//gsm;

  #Remove my own comments.
  $str=~s/\[\[[^\]]*\]\]//gs;
  $str=~s/\[\.+\]//gs;

  #Remove footnotes.
  $str=~s/^[[:space:]]*\[[0-9]*\]:.*\n//gsm;
  $str=~s/^Footnotes://gsm;
  
  # Remove singlenewlines.
  $str=~s/^[[:space:]]*\n/\n/gsm;
  $str=~s/([^\n])\n([^\n])/$1 $2/g;

  # Never more than two newlines in a row.
  $str=~s/\n\n+/\n\n/g;

  $str;
}


#==========================================================================
sub dissociate {

  my $in = $_[0];
  my $degree = int($_[1]) || 2;
  my $max_paras = $_[2] || 100;
  my @out;

  my $min_iterations = $max_paras * 20 / $degree;
  my $max_iterations = $max_paras * 60 / $degree;

  $degree = 2 if $degree == 1;

  use locale;

  die "No input\n" unless length $in;
  study $in;

  my $new_matcher;
  $new_matcher = "[.!?][[:space:]]+(" .
    join("[[:space:]]+", ("[[:^space:]]+") x $degree)
      . "[[:space:]]+)";

  # In use in the loop.
  my($re, @orig, $matched,
     $i, $last_matched, $iteration, $paranum);

  $paranum = $iteration = 0;

  # Quit if we get ($max_paras paragraphs *AND* enough iterations)
  # *OR* too many iterations.

  while ((($paranum < $max_paras) || ($iteration < $min_iterations)) &&
	 ($iteration < $max_iterations)){

    ++$iteration;

    if($last_matched) { # last thing we matched -- '' means take a stab

      @orig = map(quotemeta($_), $last_matched =~ m/([[:^space:]]+)/sg );

      $re =  join("\\s+",  @orig) # overlap
	. "\\s+("
	  . join("\\s+", ("[[:^space:]]+") x  $degree) # new tokens
	    . "[[:space:]]+)"
	      ;

      $matched = '';

      my @matches = ($in =~ m/$re/gs);

      if (scalar @matches == 0) { # The original Dissociate.pm says
	# that if it don't work, do it
	# twice.  It's magic!
	@matches = ($in =~ m/$re/gs);

	if (scalar @matches == 0) { # Maybe the only match is at end of file!
	                            # Our last hope!
	  $re =  join("\\s+",  @orig) # overlap
	    . "\\s+([[:^space:]]+\\s*)*";
	  @matches = ($in =~ m/$re/gs);
	}
      }

      print "Matches for $last_matched: ".scalar @matches."\n" if $Debug;
# 	foreach (@matches) {
# 	  print "List match: ($_)\n" if $Debug;
# 	}

	if (scalar @matches > 0) {
	  my @m = rand_set(set => \@matches,
			   size => 1);
          $matched = $m[0];

        }
	else {
	 print "Oh shit.  No matches!******\n".@matches;
	}


      print "Matched ($last_matched) with ($matched).\n" if $Debug;
      $last_matched = $matched;
      push @out, $last_matched;
      if ( $last_matched =~ /\n/ ) {
	++$paranum;
      }
      next;

    } else {
      # We don't have a last_matched -- take a stab.
      my($frame, $frame_size);
      pos($in) = 0;  # Ever necessary?
      $frame_size =  ($degree + 3) * 8;

      my $i = int(rand(length($in) - $frame_size));
      pos($in) = $i;
      print "Taking a stab at pos $i\n" if $Debug;
      if(   $in =~ m/$new_matcher/sg 
         || $in =~ m/$new_matcher/sg )  # Yes, try TWICE!  Magic, wooo.
      {
        $last_matched = $1;
	print "$last_matched matches $new_matcher.\n" if $Debug;
	push @out, $last_matched;
      } else {
        print "Can't get an initial $degree-token match" if $Debug;
        last;
      }
    }

  } # end while
  my $ret = join('', @out);

  #Remove everything after last sentence-closing punctuation.
  $ret =~ s/([.?!])[^.?!]*$/$1\n/s;

  # Never more than two newlines in a row.
  $ret=~s/\n\n+/\n\n/g;

  return $ret;
}

#==========================================================================
1;


