# Time-stamp: "2004-12-29 22:45:42 AST" -*-Perl-*- ### The POD is at the end. ### ########################################################## # # Lifted from Games::Dissociate.pm. # ########################################################## require 5.000; package JFH::Dissociate; use strict; require Exporter; use Data::Random qw(rand_set); use Text::Wrap; use vars qw(@ISA @EXPORT @EXPORT_OK $Debug $VERSION); use Carp; @ISA = qw(Exporter); @EXPORT = qw(dissociate dissociate_glob); $VERSION = 0.15; $Debug = 0; ########################################################################### sub dissociate_glob{ my $glob=shift; my $degree=shift; my $maxpara=shift; Text::Wrap::wrap('','',dissociate(read_glob_into_string($glob),$degree,$maxpara)); } sub read_glob_into_string{ my $glob=shift; my @candidates=glob($glob); my $min=60; my $max=120; my $s; my @files = rand_set(set => \@candidates, min=> $min < scalar @candidates? $min: scalar @candidates, max=> $max < scalar @candidates? $max: scalar @candidates); foreach (@files) { next unless `file "$_" |grep text`; $s .= JFH::Dissociate::read_message_into_string($_); } $s; } sub read_message_into_string { my $f=shift; open(FILE, "<$f") or die "Oops! Couldn't open $f."; my $str=join('',); #remove headers from beginning of file. Note: Headers can be #multiple line, but if so, subsequent lines start with spaces. $str=~s/^From .*\n//gs; $str=~s/^[[:space:]]*([A-Z][[:^space:]]*:[^\n]*\n([ \t]+[[:^space:]][^\n]*\n)*)*//gs; # Remove quoted words, including those that are wrapped to the next # line, but shouldn't be. $str=~s/^>[^\n]*\n(?:[^>][^\n]*\n>[^\n]*\n)*//gsmx; #Remove boxquotes $str=~s/^,----.*?`----//gsm; # Remove attributions $str=~s/.*(wrote|writes)(:| in message.*)//g; #Remove tags $str =~ s/[[:space:]]*//g; #Skip signature stuff $str=~s/^(James Harris[[:space:]]*|URL:.*)$//gsm; $str=~s/^--( |=20)\n.*//gsm; #Remove my own comments. $str=~s/\[\[[^\]]*\]\]//gs; $str=~s/\[\.+\]//gs; #Remove footnotes. $str=~s/^[[:space:]]*\[[0-9]*\]:.*\n//gsm; $str=~s/^Footnotes://gsm; # Remove singlenewlines. $str=~s/^[[:space:]]*\n/\n/gsm; $str=~s/([^\n])\n([^\n])/$1 $2/g; # Never more than two newlines in a row. $str=~s/\n\n+/\n\n/g; $str; } #========================================================================== sub dissociate { my $in = $_[0]; my $degree = int($_[1]) || 2; my $max_paras = $_[2] || 100; my @out; my $min_iterations = $max_paras * 20 / $degree; my $max_iterations = $max_paras * 60 / $degree; $degree = 2 if $degree == 1; use locale; die "No input\n" unless length $in; study $in; my $new_matcher; $new_matcher = "[.!?][[:space:]]+(" . join("[[:space:]]+", ("[[:^space:]]+") x $degree) . "[[:space:]]+)"; # In use in the loop. my($re, @orig, $matched, $i, $last_matched, $iteration, $paranum); $paranum = $iteration = 0; # Quit if we get ($max_paras paragraphs *AND* enough iterations) # *OR* too many iterations. while ((($paranum < $max_paras) || ($iteration < $min_iterations)) && ($iteration < $max_iterations)){ ++$iteration; if($last_matched) { # last thing we matched -- '' means take a stab @orig = map(quotemeta($_), $last_matched =~ m/([[:^space:]]+)/sg ); $re = join("\\s+", @orig) # overlap . "\\s+(" . join("\\s+", ("[[:^space:]]+") x $degree) # new tokens . "[[:space:]]+)" ; $matched = ''; my @matches = ($in =~ m/$re/gs); if (scalar @matches == 0) { # The original Dissociate.pm says # that if it don't work, do it # twice. It's magic! @matches = ($in =~ m/$re/gs); if (scalar @matches == 0) { # Maybe the only match is at end of file! # Our last hope! $re = join("\\s+", @orig) # overlap . "\\s+([[:^space:]]+\\s*)*"; @matches = ($in =~ m/$re/gs); } } print "Matches for $last_matched: ".scalar @matches."\n" if $Debug; # foreach (@matches) { # print "List match: ($_)\n" if $Debug; # } if (scalar @matches > 0) { my @m = rand_set(set => \@matches, size => 1); $matched = $m[0]; } else { print "Oh shit. No matches!******\n".@matches; } print "Matched ($last_matched) with ($matched).\n" if $Debug; $last_matched = $matched; push @out, $last_matched; if ( $last_matched =~ /\n/ ) { ++$paranum; } next; } else { # We don't have a last_matched -- take a stab. my($frame, $frame_size); pos($in) = 0; # Ever necessary? $frame_size = ($degree + 3) * 8; my $i = int(rand(length($in) - $frame_size)); pos($in) = $i; print "Taking a stab at pos $i\n" if $Debug; if( $in =~ m/$new_matcher/sg || $in =~ m/$new_matcher/sg ) # Yes, try TWICE! Magic, wooo. { $last_matched = $1; print "$last_matched matches $new_matcher.\n" if $Debug; push @out, $last_matched; } else { print "Can't get an initial $degree-token match" if $Debug; last; } } } # end while my $ret = join('', @out); #Remove everything after last sentence-closing punctuation. $ret =~ s/([.?!])[^.?!]*$/$1\n/s; # Never more than two newlines in a row. $ret=~s/\n\n+/\n\n/g; return $ret; } #========================================================================== 1;