#!/usr/bin/perl -w
# scrub v0.1
# A tool for finding (and eradicating!) Blosxom comment spam.
#
# Edit $writeback_path below before using!
#
# Copyright (C) 2004 Jason Clark
# http://jclark.org/weblog
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# To receive a copy of the GNU General Public License,
# write to the Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
# To learn more about the GNU General Public License, visit:
# http://creativecommons.org/licenses/GPL/2.0/

use strict;
use warnings;
use Getopt::Long;

# File::Find::Rule is available on the CPAN:
# http://search.cpan.org/~rclamp/File-Find-Rule-0.28/lib/File/Find/Rule.pm
# It is loaded on demand further down, because it is only needed when the
# whole writeback directory is scanned (i.e. when -file is not given).

# EDIT THE FOLLOWING to point to your writeback directory
my $writeback_path = "/blosxom/plugins/state/writeback";

BEGIN {
    # Read input in records terminated by a "-----" line instead of by
    # newline, so every readline below yields one whole record.  This is
    # deliberately global (not local): File::Find::Rule's grep reads files
    # through the same input record separator.
    # NOTE(review): presumably one record == one writeback comment; confirm
    # against the writeback file format in use.
    $/ = "-----\n";

    # Unbuffered STDOUT so progress output appears immediately.
    $| = 1;
}

my ($regex, $listing, $showing, $counting, $scrubbing,
    $verbose, $filename, $helpme, @files);

# Program name without its directory part, for the usage text.
(my $prog) = ($0 =~ m{([^/]+$)});

my $usage = <<"USAGE";
Usage: $prog -regex <regex> [-list] [-show|-scrub] [-count] [-file <file>]

Locates comment spam, and optionally cleans it up.

  -regex <regex>  Required.  Sets the regex used to identify spam comments.
  -list           List filenames of comment files containing spam.
  -scrub          Remove all spam comments from each file processed.
  -show           Display each spam comment.
  -count          Display count of files containing spam.
  -file <file>    Only search <file> for spam.
  -verbose        Print the effective option values before running.
  -help           Show this message.

NOTE: -show and -scrub are mutually exclusive

Examples:

1. List all files containing 'spam.com', display total:
     $prog -regex 'spam\\.com' -list -count

2. Remove all comments containing 'spam.com', show progress via filenames
     $prog -regex 'spam\\.com' -list -scrub

3. Show all files containing raw html hyperlinks, and the actual comments:
     $prog -regex '<a href' -list -show
USAGE

my $options_ok = GetOptions(
    "regex=s" => \$regex,
    "list"    => \$listing,
    "show"    => \$showing,
    "count"   => \$counting,
    "file=s"  => \$filename,
    "verbose" => \$verbose,
    "scrub"   => \$scrubbing,
    "help"    => \$helpme,
);

# Getopt::Long has already reported what was wrong; just show the usage.
unless ($options_ok) {
    print STDERR $usage;
    exit 1;
}

# At least one action (-list, -show, -count or -scrub) must be requested.
if ($helpme or !($listing or $showing or $counting or $scrubbing)) {
    print STDERR $usage;
    exit 1;
}

unless (defined $regex) {
    print STDERR "You must specify a regex.\n\n$usage";
    exit 1;
}

if ($showing and $scrubbing) {
    print STDERR "Cannot use -show and -scrub concurrently. Sorry.\n";
    exit 1;
}

# Compile the user's pattern once, up front, so a malformed regex is
# reported cleanly before any file is touched.
my $pattern = eval { qr/$regex/ };
unless (defined $pattern) {
    print STDERR "Invalid regex '$regex': $@\n";
    exit 1;
}

if ($verbose) {
    # Normalise unset flags so the dump never interpolates undef.
    my ($list_flag, $show_flag, $count_flag, $scrub_flag)
        = map { $_ ? 1 : 0 } ($listing, $showing, $counting, $scrubbing);
    my $file_shown = defined $filename ? $filename : '';

    print <<"VERBOSE";
regex: $regex
list : $list_flag
show : $show_flag
count: $count_flag
scrub: $scrub_flag
file : $file_shown
VERBOSE
}

if (defined $filename) {
    # Single-file mode: the named file is processed as-is, without first
    # checking that it contains a match.
    @files = ($filename);
}
else {
    eval { require File::Find::Rule; 1 }
        or die "File::Find::Rule (from the CPAN) is required unless -file is given:\n$@";

    # Every *.wb file under the writeback directory that has at least one
    # record matching the pattern.
    @files = File::Find::Rule->file
                             ->name('*.wb')
                             ->grep($pattern)
                             ->in($writeback_path);
}

foreach my $file (@files) {
    if ($listing) {
        print "$file\n";
    }

    if ($scrubbing) {
        # In-place edit of $file: an empty $^I means no backup copy is kept.
        # Inside the loop, print writes to the file being rewritten, so only
        # the records that do NOT match survive.
        local ($^I, @ARGV) = ('', $file);
        while (<>) {
            print $_ unless /$pattern/;
        }
    }
    elsif ($showing) {
        my $fh;
        unless (open($fh, '<', $file)) {
            # Best effort: report the problem and carry on with the rest.
            warn "Cannot open $file: $!\n";
            next;
        }
        while (my $record = <$fh>) {
            print $record if $record =~ /$pattern/;
        }
        close($fh);
    }
}

# Number of files processed (with -file this is always 1).
my $count = scalar(@files);
if ($counting) {
    print "$count files matched.\n";
}