#!/usr/bin/perl -w
# scrub v0.1
# A tool for finding (and eradicating!) Blosxom comment spam.
#
# Edit $writeback_path below before using!
# 
# Copyright (C) 2004  Jason Clark
# http://jclark.org/weblog
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# To receive a copy of the GNU General Public License, 
# write to the Free Software Foundation, Inc., 
# 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
# To learn more about the GNU General Public License, visit:
#   http://creativecommons.org/licenses/GPL/2.0/

use strict;
use Getopt::Long; 

# Available on the CPAN: http://search.cpan.org/~rclamp/File-Find-Rule-0.28/lib/File/Find/Rule.pm
use File::Find::Rule;

# EDIT THE FOLLOWING to point to your writeback directory
my $writeback_path = "/blosxom/plugins/state/writeback";

# Process-wide I/O settings.  They live in a BEGIN block so they are applied
# at compile time, before any of the script's run-time statements execute.
BEGIN {
    # Turn on autoflush for the currently selected output handle.
    $| = 1;

    # Use a line of five dashes as the input record terminator, so every
    # readline returns everything up to and including that line.
    # NOTE(review): presumably this is how individual comments are delimited
    # inside a writeback file -- confirm against the writeback plugin.
    $/ = "-----\n";
}

# Option values filled in by GetOptions, plus the list of files to process.
my ($regex, $listing, $showing, $counting, $scrubbing, $verbose, $filename, $helpme, @files);

# Name this script was invoked as, with any leading directories stripped;
# used in the usage text.
(my $prog) = ($0 =~ /([^\/]+$)/);

# Help text printed for -help and for invalid invocations.
# The heredoc interpolates (for $prog), so a literal backslash in the regex
# examples has to be written as "\\" or it would be dropped from the output.
my $usage =<<END;
$prog -regex <regex> [-list] [-show|-scrub] [-count] [-file <filename>] [-verbose]

Locates comment spam, and optionally cleans it up.

  -regex <regex>   Required.  Sets the regex used to identify spam comments.
  -list            List filenames of comment files containing spam.
  -scrub           Remove all spam comments from each file processed.
  -show            Display each spam comment.
  -count           Display count of files containing spam.
  -file <filename> Only search <filename> for spam.
  -verbose         Display the option values in effect before running.
  -help            Display this message.

At least one of -list, -show, -count or -scrub must be given.

NOTE:  -show and -scrub are mutually exclusive

Examples:
1. List all files containing 'spam.com', display total:
 $prog -regex 'spam\\.com' -list -count

2. Remove all comments containing 'spam.com', show progress via filenames
 $prog -regex 'spam\\.com' -list -scrub 

3. Show all files containing raw html hyperlinks, and the actual comments:
 $prog -regex '<a href' -list -show

END

# Parse the command line.  GetOptions returns false when it meets an unknown
# or malformed option (after warning on STDERR itself).  Stop in that case
# rather than carrying on: a mistyped option must never lead to an
# unintended -scrub run.
unless (GetOptions("regex=s" => \$regex,
                   "list"    => \$listing,
                   "show"    => \$showing,
                   "count"   => \$counting,
                   "file=s"  => \$filename,
                   "verbose" => \$verbose,
                   "scrub"   => \$scrubbing,
                   "help"    => \$helpme,
                   )) {
    print STDERR "\n$usage";
    exit 1;
}

# Show the usage text on request, or when no action at all was asked for.
if ($helpme or !($listing or $showing or $counting or $scrubbing)) {
    print STDERR $usage;
    exit 1;
}

# The regex is what identifies spam; nothing can be done without it.
unless (defined $regex) {
    print STDERR "You must specify a regex.\n\n$usage";
    exit 1;
}

# -show and -scrub are rejected when combined.
if ($showing and $scrubbing) {
    print STDERR "Cannot use -show and -scrub concurrently.  Sorry.\n";
    exit 1;
}

if ($verbose) {
    # Options that were not given are undef; print them as empty strings so
    # the dump does not trigger "uninitialized value" warnings under -w.
    my ($v_list, $v_show, $v_count, $v_scrub, $v_file) =
        map { defined $_ ? $_ : '' }
            ($listing, $showing, $counting, $scrubbing, $filename);

    print <<"VERBOSE";
regex: $regex
list : $v_list
show : $v_show
count: $v_count
scrub: $v_scrub
file : $v_file
VERBOSE
}

# Build @files, the list of files that contain at least one matching record.

# Compile the pattern once, up front, whichever branch is taken below; an
# invalid regex therefore fails here, before any file is touched.
$regex = qr/$regex/;

if (defined $filename) {
    # -file: look at this one file only.  It is kept only when some record
    # actually matches, so -list and -count report it as spam only if it is.
    open(my $fh, '<', $filename)
        or die "$prog: cannot open $filename: $!\n";
    while (my $record = <$fh>) {
        if ($record =~ $regex) {
            @files = ($filename);
            last;
        }
    }
    close($fh);
} else {
    # No -file: search the writeback directory for *.wb files whose content
    # matches the pattern.
    @files = File::Find::Rule->file
                             ->name('*.wb')
                             ->grep($regex)
                             ->in($writeback_path);
}

# Process every selected file according to the requested actions.
foreach my $file (@files) {
    # -list: report the file name.
    print "$file\n" if $listing;

    if ($scrubbing) {
        # -scrub: rewrite the file in place, dropping every record that
        # matches the pattern.  Setting $^I to '' turns on in-place editing
        # without a backup copy; while it is active, print inside the <>
        # loop writes to the file being edited instead of STDOUT.
        local ($^I, @ARGV) = ('', ($file));
        while (<>) {
            print $_ unless /$regex/;
        }
    } elsif ($showing) {
        # -show: print each matching record.  A file that cannot be opened
        # is reported and skipped instead of being silently ignored.
        my $fh;
        unless (open($fh, '<', $file)) {
            warn "$prog: cannot open $file: $!\n";
            next;
        }
        while (my $record = <$fh>) {
            print $record if $record =~ /$regex/;
        }
        close($fh);
    }
}

# -count: report how many files ended up in @files.
my $count = @files;
print "$count files matched.\n" if $counting;
