Note: After saving, you have to bypass your browser's cache to see the changes. Google Chrome, Firefox, Microsoft Edge and Safari: Hold down the ⇧ Shift key and click the Reload toolbar button. For details and instructions about other browsers, see Wikipedia:Bypass your cache.
//<syntaxhighlight language=perl>
#! /usr/bin/perl
use LWPx::ParanoidAgent;
use HTTP::Cookies;
use URI::Escape;
use Text::Align::WagnerFischer;
# Shared HTTP state used by the request helpers in this file: a cookie jar
# backed by a file under $HOME (saved automatically) and one user agent with
# a 20 second timeout that sends and stores cookies through that jar.
$cookie_jar = HTTP::Cookies->new(
    file     => "$ENV{'HOME'}/lwp_cookies.dat",
    autosave => 1,
);
$ua = LWPx::ParanoidAgent->new(timeout => 20);
$ua->agent("CorenSearchBot/1.0 ");
$ua->cookie_jar($cookie_jar);
# Doing($msg)
# Print a progress message on its own line, prefixed with a tab, to the
# currently selected output handle.
sub Doing($) {
    my $line = "\t" . $_[0] . "\n";
    print $line;
}
# significant($text)
# Split $text into lines and return only the lines that look like real prose.
# A line is dropped when it contains " Category "/" Categories " or "align".
# Otherwise every all-lowercase word of five or more letters scores +1 and
# every '*' sitting directly between two word characters scores -2; the line
# is kept only when the total is at least 3.
sub significant($) {
    my ($text) = @_;
    my @kept;
    LINE:
    for my $line (split "\n", $text) {
        next LINE if $line =~ m/ Categor(y|ies) /;
        next LINE if $line =~ m/align/;

        # Count all matches by forcing list context on a /g match.
        my $long_words = () = $line =~ m/\b[a-z]{5,}\b/g;
        my $stars      = () = $line =~ m/\b\*\b/g;

        next LINE if $long_words - 2 * $stars < 3;
        push @kept, $line;
    }
    return @kept;
}
# complete($text)
# Split $text into lines and return all of them except those that contain
# " Category " or " Categories ".
sub complete($) {
    my ($text) = @_;
    my @kept = grep { $_ !~ m/ Categor(y|ies) / } split "\n", $text;
    return @kept;
}
# tokenize(@lines)
# Break every line on single spaces and return, in order, the pieces that are
# longer than three characters.
sub tokenize(@) {
    my @tokens;
    for my $line (@_) {
        push @tokens, grep { length($_) > 3 } split(/ /, $line);
    }
    return @tokens;
}
# statementize($text)
# Reduce free text to one "statement" per line and return the result.
#
# The text is worked on in a lexical copy. The previous version assigned to
# the global $_, which overwrote the caller's topic variable and died with
# "Modification of a read-only value" whenever $_ was aliased to a constant.
sub statementize($) {
    my ($text) = @_;

    # Runs of two or more dashes become a single space.
    $text =~ s/---*/ /g;

    # NOTE(review): "!-?" is a character RANGE (0x21 through 0x3F), so digits,
    # '.', '*', '=' and most ASCII punctuation are all turned into spaces
    # here. The period and asterisk rules below therefore look unreachable.
    # Confirm whether only the three literal characters '!', '-' and '?' were
    # intended before changing this; it is kept as-is to preserve scoring.
    $text =~ tr/!-?/ /;

    # Trim leading and trailing spaces.
    $text =~ s/^ *//g;
    $text =~ s/ *$//g;

    # Drop an asterisk glued to the following character.
    $text =~ s/\*([^ .])/$1/g;

    # Break after each period, and before a capital that directly follows one.
    $text =~ s/\. */.\n/g;
    $text =~ s/\.([A-Z])/\n$1/sg;

    # Tidy up: no spaces before periods, no blank lines, no trailing periods.
    $text =~ s/ *\././g;
    $text =~ s/\n\n*/\n/gs;
    $text =~ s/\.\n/\n/gs;

    return $text;
}
# normalizewikitext($wikitext)
# Strip wiki markup from $wikitext and return the result of statementize()
# on what is left.
#
# Works on a lexical copy instead of assigning to the global $_ (which
# clobbered the caller's topic variable), uses $N rather than \N in
# replacement text, and escapes literal braces in the template pattern as
# perlre recommends.
sub normalizewikitext($) {
    my ($text) = @_;

    # List markers become colons.
    $text =~ tr/*#/::/;

    # Drop <ref>...</ref> bodies, then any remaining tags and entities.
    $text =~ s/<ref>.*?<\/ref>/ /igs;
    $text =~ s/<.*?>/ /igs;
    $text =~ s/&[^;]*;/ /gs;

    # Unwrap bold/italic quote runs; repeat until nothing is left to unwrap.
    while ($text =~ s/('''*)(.*?)\1/ $2 /gs) { }

    # [[target]] -> target, [[target|label]] -> label,
    # [url label] -> label, any other [bracketed] text is removed.
    $text =~ s/\[\[([^|\]]*)]]/ $1 /gs;
    $text =~ s/\[\[.*?\|(.*?)]]/ $1 /gs;
    $text =~ s/\[[^ ]* (.*?)]/ $1 /gs;
    $text =~ s/\[.*?]/ /gs;

    # A heading at the very start of the text becomes a sentence.
    # (No /m flag, so ^ only matches at the beginning of the string.)
    $text =~ s/^(===*)(.*?)\1/$2. /g;

    # Remove template calls.
    $text =~ s/\{\{.*?\}\}/ /gs;

    return statementize($text);
}
# normalizewebtext($html)
# Replace tags and character entities in $html with spaces and return the
# result of statementize() on what is left.
#
# Works on a lexical copy; the previous version assigned to the global $_
# and so overwrote the caller's topic variable.
sub normalizewebtext($) {
    my ($html) = @_;
    $html =~ s/<.*?>/ /igs;
    $html =~ s/\&.*?;/ /gs;
    return statementize($html);
}
# WPRequest(@params)
# POST the given "key=value" strings, joined with '&', to the English
# Wikipedia api.php endpoint using the shared user agent.
# Returns the response body on success, undef otherwise.
sub WPRequest(@) {
    my @params = @_;
    my $req = HTTP::Request->new(POST => 'http://en.wikipedia.org/w/api.php');
    $req->content_type('application/x-www-form-urlencoded');
    $req->content(join('&', @params));

    my $res = $ua->request($req);
    return $res->content if $res->is_success;
    return undef;
}
# WPLogin($uname, $pwd)
# Submit the Special:Userlogin form with the given credentials through the
# shared user agent so that the session cookies end up in the cookie jar.
# Always returns "Ok"; the response is not inspected for login failure.
sub WPLogin($$) {
    my ($uname, $pwd) = @_;

    # Both values go into a form-encoded body, so both must be escaped.
    $uname = uri_escape($uname);
    $pwd   = uri_escape($pwd);

    my $req = HTTP::Request->new(POST => 'http://en.wikipedia.org/w/index.php?title=Special:Userlogin&action=submitlogin&type=login');
    $req->content_type('application/x-www-form-urlencoded');
    $req->content("wpName=$uname&wpPassword=$pwd&wpRemember=1&wpLoginattempt=Log+in");
    my $res = $ua->request($req);

    # Cookies are carried by the RESPONSE; the request object that was
    # passed here before has no Set-Cookie headers to extract.
    $cookie_jar->extract_cookies($res);
    return "Ok";
}
# WPStartEdit($title)
# Fetch the edit form for $title and pull out what is needed to save it.
# Returns ($escaped_title, $edit_token, $extra_fields, $text) on success,
# where $extra_fields is a ready-to-append "&wpStarttime=...&wpEdittime=..."
# string, and undef when the request failed.
sub WPStartEdit($) {
    my ($title) = @_;
    $title = uri_escape($title);
    my $req = HTTP::Request->new(GET => "http://en.wikipedia.org/w/index.php?title=$title&action=edit");
    my $res = $ua->request($req);
    my $html = $res->content;

    my $txt;
    $txt = $1 if $html =~ m/<textarea[^>]*>(.*)<\/textarea>/s;

    # The textarea content is HTML-escaped; turn it back into raw wikitext.
    # These patterns must name the entities: as plain "<", ">", '"' and "&"
    # they replaced each character with itself and decoded nothing.
    # "&amp;" is decoded last so "&amp;lt;" ends up as "&lt;", not "<".
    $txt =~ s/&lt;/</gs;
    $txt =~ s/&gt;/>/gs;
    $txt =~ s/&quot;/"/gs;
    $txt =~ s/&amp;/\&/gs;

    my $et;
    $et = $1 if $html =~ m/<input type='hidden' value="([^"]*?)" name="wpEditToken" \/>/s;

    my $more;
    $more .= '&wpStarttime='.uri_escape($1) if $html =~ m/<input type='hidden' value="([^"]*?)" name="wpStarttime" \/>/s;
    $more .= '&wpEdittime='.uri_escape($1) if $html =~ m/<input type='hidden' value="([^"]*?)" name="wpEdittime" \/>/s;

    if ($res->is_success) {
        return ($title, $et, $more, $txt);
    }
    return undef;
}
# WPTryEdit($title, $et, $more, $txt, $es)
# Submit an edit: $title is the (already escaped) page title, $et the edit
# token, $more the extra pre-encoded form fields, $txt the new page text and
# $es the edit summary.
# Returns 1 when the response contains no "<textarea", undef when it does.
sub WPTryEdit($$$$$) {
    my ($title, $et, $more, $txt, $es) = @_;

    my $body = join '',
        'wpSection=',
        '&wpSummary=',   uri_escape($es),
        '&wpSave=wpSave',
        '&wpEditToken=', uri_escape($et),
        '&wpTextbox1=',  uri_escape($txt),
        $more;

    my $req = HTTP::Request->new(POST => "http://en.wikipedia.org/w/index.php?title=$title&action=submit");
    $req->content_type('application/x-www-form-urlencoded');
    $req->content($body);
    my $res = $ua->request($req);

    # Picks up a token from the response; nothing below reads it.
    $et = $1 if $res->content =~ m/<input type='hidden' value="([^"]*?)" name="wpEditToken" \/>/s;

    return $res->content =~ m/<textarea/ ? undef : 1;
}
# WPArticle($title)
# Ask the API for the latest revision content of $title.
# Returns the text between <rev> and </rev> when the reply contains that
# element, otherwise the reply unchanged (undef if the request failed).
sub WPArticle($) {
    my ($title) = @_;
    my @query = (
        'action=query',
        'prop=revisions',
        'titles=' . uri_escape($title),
        'rvprop=content',
        'rvlimit=1',
        'format=xml',
    );
    my $art = WPRequest(@query);
    if ($art =~ m/<rev>(.*?)<\/rev>/s) {
        $art = $1;
    }
    return $art;
}
# WPNewPages()
# Fetch up to 500 recent changes in namespace 0 and return the titles of the
# type="1" entries, in reply order, stopping at the first entry whose revid
# is not newer than the global $last_revid. $last_revid is then raised to the
# highest revid collected.
sub WPNewPages() {
    my @query = (
        'action=query',
        'list=recentchanges',
        'rclimit=500',
        'rcnamespace=0',
        'format=xml',
    );
    my $list = WPRequest(@query);

    my @news;
    my $maxrid = 0;
    while ($list =~ m/<rc type="1" .*? title="([^"]*)" .*? revid="([0-9]+)"/g) {
        my ($page, $revid) = ($1, $2);
        last if $revid <= $last_revid;
        $maxrid = $revid if $revid > $maxrid;
        push @news, $page;
    }

    $last_revid = $maxrid if $maxrid > $last_revid;
    return @news;
}
# WPCreator($title)
# Ask the API for the user of the oldest revision of $title (rvdir=newer,
# rvlimit=1). Returns the user name, or undef when the reply has no matching
# <rev user="..." /> element.
sub WPCreator($) {
    my ($title) = @_;
    my @query = (
        'action=query',
        'prop=revisions',
        'titles=' . uri_escape($title),
        'rvprop=user',
        'rvlimit=1',
        'rvdir=newer',
        'format=xml',
    );
    my $reply = WPRequest(@query);
    if ($reply =~ m/<rev user="([^"]*?)" \/>/s) {
        return $1;
    }
    return undef;
}
# YahooFind($query)
# Run a web search for $query (5 results, English) and return the result
# URLs, each with one trailing slash removed. <Cache> sections of the reply
# are discarded first so their URLs are not picked up.
sub YahooFind($) {
    my $query = join(' ', @_);
    my $req = HTTP::Request->new(GET => 'http://search.yahooapis.com/WebSearchService/V1/webSearch?appid=SANITIZED&query='.uri_escape($query).'&results=5&language=en');
    my $res = $ua->request($req);

    my $r = $res->content;
    $r =~ s/<Cache>.*?<\/Cache>//sg;
    my @re = $r =~ m/<Url>([^<]*?)\/?<\/Url>/gs;

    # Report the real count; "$#re+1" inside a string printed e.g. "4+1".
    Doing("Search \"$query\" found " . scalar(@re) . " results");
    return @re;
}
# top3($q)
# Search for $q and file the first three results into the caller's
# dynamically scoped lists: @enwiki gets titles of English Wikipedia
# articles, @web gets every other URL whose host matches no pattern in
# @exclude. PDF links and duplicates are skipped. Stops as soon as @web
# holds more than six entries. Returns nothing useful.
sub top3($) {
    my ($q) = @_;

    # This must be an assignment: "my @uri, YahooFind($q);" threw the search
    # results away and left @uri empty, so no candidate was ever recorded.
    my @uri = YahooFind($q);
    $#uri = 2 if $#uri > 2;

    SITE:
    foreach my $uri (@uri) {
        next if $uri =~ m/\.[pP][Dd][Ff]/;
        foreach my $known (@web) {
            next SITE if $known eq $uri;
        }

        # The host is whatever follows "scheme://" up to the next slash or
        # the end of the URL. YahooFind strips a trailing slash, so a bare
        # "http://host" must still yield a host here, otherwise it would
        # slip past the exclusion list below.
        my $site;
        $site = $1 if $uri =~ m{^[^:]*://([^/]*)};

        if ($site eq 'en.wikipedia.org' and $uri =~ m{/wiki/}) {
            # Turn the URL into a page title: keep what follows /wiki/,
            # undo the URL escaping and use spaces instead of underscores.
            $uri =~ s{.*/wiki/(.*)}{$1};
            $uri = uri_unescape($uri);
            $uri =~ tr/_/ /;
            foreach my $known (@enwiki) {
                next SITE if $known eq $uri;
            }
            push @enwiki, $uri;
            next SITE;
        }

        foreach my $re (@exclude) {
            next SITE if $site =~ $re;
        }
        push @web, $uri;
        return if $#web > 5;
    }
}
# findmatches($title)
# Look for web pages or other Wikipedia articles that the article $title
# appears to have been copied from.
#
# Search queries are built from the quoted title (with any parenthesised
# part removed) plus words from the significant lines at index 1, 7 and
# second-to-last, and finally from the title alone. Every candidate found by
# top3() is tokenised and aligned against the article tokens; lower scores
# mean a closer match.
#
# Returns:
#   undef                     when the article has fewer than 6 tokens or no
#                             significant lines;
#   ($why, $what, $score/10)  when a candidate scored below $config{MinScore};
#                             $why is 'pageincluded', 'pageincludes' or
#                             'wikipage' and $what the URL or article title;
#   ('', '', 1000)            when nothing scored low enough.
sub findmatches($) {
    my $article = WPArticle($_[0]);
    my @atokens = tokenize(complete(normalizewikitext($article)));
    my @paras = significant(normalizewikitext($article));
    my $why = undef;
    my $score = $config{MinScore};
    my $what = undef;
    my $what_ok;
    my $score_ok = 50000;

    # top3() fills these; "local" gives each call its own fresh lists.
    local @web;
    local @enwiki;

    return undef if $#atokens < 5;
    $#atokens = 200 if $#atokens > 200;

    my $ln = 0;
    my $title = $_[0];
    $title =~ s/\(.*?\) *//;
    foreach my $l (@paras) {
        if ($ln==1 or $ln==7 or $ln==($#paras-1)) {
            if ($l =~ m/ (.*)\.?/) {
                my @tq = split ' ', $1;
                my @q;
                my $num = 0;
                foreach my $w (@tq) {
                    push @q, $w if $w =~ m/[a-zA-Z0-9*]/;
                    $num++ if not $w eq '*';
                    last if $num > 9;
                }
                my $q = join ' ', @q;
                top3("\"$title\" $q");
            }
        }
        $ln++;
    }
    return undef if $#paras < 0;
    top3("\"$title\"");

    foreach my $uri (@web) {
        Doing("checking $uri");
        my @src = eval {
            local $SIG{ALRM} = sub { die "alarm\n" };
            alarm 25;
            my $req = HTTP::Request->new(GET => $uri);
            my $res = $ua->request($req);
            # Disarm only once the fetch is over. Disarming before the
            # request meant the watchdog never covered the download.
            alarm 0;
            if ($res->is_success) {
                my @src = tokenize(complete(normalizewebtext($res->content)));
                return @src if $#src > 9;
            }
            return undef;
        };
        my $failure = $@;
        # If the eval died for any other reason the alarm is still armed and
        # would later fire with no handler installed; clear it here.
        alarm 0;
        next if $#src < 10;
        next if $failure eq "alarm\n";

        # Bound the size of the alignment problem.
        $#src = 100000/$#atokens if $#src*$#atokens > 100000;
        my $alignment = Text::Align::WagnerFischer->new(
            left => \@src,
            right => \@atokens,
            weights => [0,1,2]
        );
        my $maybe = 'pageincluded';
        my $dif = abs ($#src-$#atokens);
        my $sina = ($alignment->cost()-$dif)*1000/$#src;
        my $ains = ($alignment->cost()-$dif)*1000/$#atokens;
        Doing("$#src/$#atokens $dif gives cost ".($alignment->cost()-$dif)." for $sina/$ains");
        if ($ains > $sina) {
            $maybe = 'pageincludes';
            $sina = $ains;
        }

        # Articles shorter than 30 tokens must score proportionally lower.
        my $need = $config{MinScore};
        $need = ($need*$#atokens)/30 if $#atokens<30;
        if ($sina < $need and $sina < $score) {
            $why = $maybe;
            $score = $sina;
            $what = $uri;
        }
        if ($sina < $score_ok) {
            $score_ok = $sina;
            $what_ok = $uri;
        }
    }

    foreach my $uri (@enwiki) {
        next if $uri eq $_[0];
        my $test = WPArticle($uri);
        my @src = tokenize(complete(normalizewikitext($test)));
        next if $#src < 10;
        my $alignment = Text::Align::WagnerFischer->new(
            left => \@src,
            right => \@atokens,
            weights => [-1,1,2]
        );
        my $sina = $alignment->cost()*1000/$#src;
        my $ains = $alignment->cost()*1000/$#atokens;
        $sina = $ains if $ains < $sina;
        if ($sina<-400 and $sina < $score) {
            $why = 'wikipage';
            $what = $uri;
            $score = $sina;
        }
        if ($sina < $score_ok) {
            $score_ok = $sina;
            $what_ok = $uri;
        }
    }

    return ($why, $what, ($score)/10) if $score < $config{MinScore};
    Doing("Best match was $what_ok with $score_ok");
    return ('', '', 1000);
}
# TagPage($title, $type, $what)
# Tag the article $title as a suspected copy of $what ($type selects the
# csb-* template), leave a notice on the creator's talk page and add an entry
# to the report page named by $config{ReportTo}.
#
# Returns a short string naming the reason when the page was NOT tagged
# (creator listed in @allies, page is a redirect, already attributed or
# tagged, page gone), and undef once the tag, notice and report are done.
#
# Signatures and timestamps are written as ~~~~ / ~~~~~ so MediaWiki fills
# them in when the edit is saved. The literal "22:41, 18 August 2007 (UTC)"
# that stood here stamped every notice and report with the same old date.
sub TagPage($$$) {
    my ($title, $type, $what) = @_;
    my $tag = "{{csb-$type|1=$what}}";

    my $user = WPCreator($title);
    foreach my $ally (@allies) {
        return "creator trusted" if $user eq $ally;
    }
    $user = "User talk:$user" if defined $user;

    while (1) {
        my ($ttl, $token, $more, $text) = WPStartEdit($title);
        return "article is (now) a redirect" if $text =~ m/^#REDIRECT/;
        return "attributed" if $text =~ m/\{\{DANFS\}\}/i;
        return "attributed" if $text =~ m/\{\{[cC]atholic\}\}/i;
        return "speedied" if $text =~ m/\{\{db/;
        return "marked copyvio" if $text =~ m/\{\{copyvio/;
        return "already tagged" if $text =~ m/\{\{csb-/;
        return "page gone" if length($text) < 20;

        $text = "$tag\n\n" . $text;
        # A failed save starts over with a fresh copy of the page.
        next unless WPTryEdit($ttl, $token, $more, $text, "Tagging for copyvio of $what");

        # Notify the creator, retrying until the edit goes through.
        while (defined $user) {
            ($ttl, $token, $more, $text) = WPStartEdit($user);
            $text .= "\n{{subst:csb-notice-$type|$title|url=$what}} — ~~~~\n";
            last if WPTryEdit($ttl, $token, $more, $text, "Notifying user of copyvio on $title");
        }

        # Add the page to the report list unless it is already linked there.
        while (1) {
            ($ttl, $token, $more, $text) = WPStartEdit($config{ReportTo});
            # \Q..\E: titles may contain regex metacharacters such as "(".
            my $re = qr/\[\[\Q$title\E]]/s;
            last if $text =~ $re;
            if ($type eq 'wikipage') {
                $text .= "* [[$title]] — [[$what]]. Reported by [[User:CorenSearchBot|CSBot]] at ~~~~~\n";
            } else {
                $text .= "* [[$title]] — [$what $what]. Reported by [[User:CorenSearchBot|CSBot]] at ~~~~~\n";
            }
            last if WPTryEdit($ttl, $token, $more, $text, "Adding violation on $title");
        }
        return undef;
    }
}
# configstatus()
# Reload the bot's settings from its wiki subpages into the globals
# %config (Name=value lines), @exclude (one case-insensitive, end-anchored
# regex per line that ends in a dot plus 2-4 lowercase letters) and
# @allies (one entry per line).
sub configstatus() {
    undef %config;
    undef @exclude;
    undef @allies;

    for my $line (split "\n", WPArticle("User:CorenSearchBot/config")) {
        next unless $line =~ m/ *([A-Za-z]+)=(.*)/;
        $config{$1} = $2;
    }

    for my $line (split "\n", WPArticle("User:CorenSearchBot/exclude")) {
        next unless $line =~ m/ *([^=]*\.[a-z]{2,4})$/;
        push @exclude, qr/$1$/i;
    }

    for my $line (split "\n", WPArticle("User:CorenSearchBot/allies")) {
        next unless $line =~ m/ *([^=]*)$/;
        push @allies, $1;
    }
}
# Main program.
# Log in, load the configuration, then loop forever: check newly created
# pages (plus any titles given on the command line) and tag the ones that
# look copied. When no new pages are waiting, process manual requests from
# User:CorenSearchBot/manual and post each outcome to
# User:CorenSearchBot/results. With nothing at all to do, sleep 20 seconds
# and reload the configuration.
#
# Result templates carry ~~~~~ so MediaWiki inserts the time of the save;
# the literal "22:41, 18 August 2007 (UTC)" that stood here gave every
# result the same stale date.
my @npq;
my $ok = WPLogin('CorenSearchBot', SANITIZED);
configstatus();
print "Configuration read.\n";
print "(", scalar(@exclude), " exclusions)\n";
print "(", scalar(@allies), " allies)\n";
print "Report to '$config{ReportTo}'\n";
print "Is a copy below $config{MinScore}\n";
print "\n";
push @npq, @ARGV;
my @manuals;
while (1) {
    if ($#npq < 1) {
        print "Fetching new pages\n";
        push @npq, WPNewPages();
        print scalar(@npq), " page(s) to check. (last revid $last_revid)\n";
        if ($#npq < 0) {
            if ($#manuals < 0) {
                # Collect the pending manual requests ...
                foreach my $line (split "\n", WPArticle("User:CorenSearchBot/manual")) {
                    push @manuals, $1 if $line =~ m/\[\[([^]]*)]]$/;
                }
                # ... and empty the request section, retrying until saved.
                while ($#manuals >= 0) {
                    my ($ttl, $token, $more, $text) = WPStartEdit("User:CorenSearchBot/manual");
                    $text =~ s/==Unprocessed requests==.*==Recent Results==/==Unprocessed requests==\n\n==Recent Results==/s;
                    last if WPTryEdit($ttl, $token, $more, $text, "Removing pending requests");
                }
            }
            if ($#manuals >= 0) {
                my $page = pop @manuals;
                my $result = "{{User:CorenSearchBot/result-no|$page|~~~~~}}\n";
                print "Manually checking [[$page]]\n";
                my ($why, $what, $score) = findmatches($page);
                $score = int(100-$score);
                $result = "{{User:CorenSearchBot/result-unknown|$page|~~~~~}}\n" if $score > -10;
                if (defined $why and not $why eq '') {
                    print "\t\033[31;1m[[$page]] is $why of [$what] with confidence $score\033[0m\n";
                    $result = "{{User:CorenSearchBot/result-yes|$page|$score|~~~~~|url=$what}}\n";
                }
                # Append the outcome, retrying until the edit is saved.
                while (1) {
                    my ($ttl, $token, $more, $text) = WPStartEdit("User:CorenSearchBot/results");
                    $text .= $result;
                    last if WPTryEdit($ttl, $token, $more, $text, "Posting result of manual check");
                }
            } else {
                print "Sleeping.\n";
                sleep 20;
                configstatus();
            }
        }
    }
    if ($#npq >= 0) {
        my $page = shift @npq;
        print "Checking [[$page]]\n";
        my ($why, $what, $score) = findmatches($page);
        if (defined $why and not $why eq '') {
            $score = int(100-$score);
            print "\t\033[31;1m[[$page]] is $why of [$what] with confidence $score\033[0m\n";
            my $res = TagPage($page, $why, $what);
            if (defined $res) {
                print "\tTagging: $res\n";
            } else {
                print "\tTags placed\n";
            }
        }
    }
}
//</syntaxhighlight>
You must be logged in to post a comment.