#!/usr/bin/perl
#
# Date: 12.07.2003
# Author: Tony Watson
# Email: paw@paw.org
# Homepage: www.terrorist.net
# Title: News Crawler v2.0
# Description: Gather Internet & Tech articles from various sources. Modular in design so
# I can add or modify new sites without much effort. This is a re-write of
# the original "securitycrawler/newscrawler" scripts I wrote back in 1999 which
# I lost the source code to...
#
# No includes or require modules!!! Simplicity is so wonderful.
#
# Capture the current date once; MAKEHTML prints it in every section header.
$timestamp = `/bin/date`;

# Emit one section per scraper, in display order. Each name is the name of a
# scraper subroutine defined further down in this file; MAKEHTML looks the
# subroutine up by that name.
foreach my $scraper_name (qw(SLASHDOT WIRED REUTERS YAHOO CNET1 CNET2)) {
    MAKEHTML($scraper_name);
}
# MAKEHTML - print one news section for the named source to STDOUT.
#
# Parameter:
#   $_[0]  Name of a scraper subroutine (e.g. "SLASHDOT"). It is called with
#          no arguments and must return a list of records of the form
#          "num\tsource\tsourceurl\theadline\tlink\n". Empty records are
#          ignored, so the leading "" placeholder the scrapers return does
#          not need to be subtracted from the count by hand.
#
# Output: the source name, the article count, the global timestamp, a blank
# line, then one anchor per article (last record first, as before), then two
# newlines.
sub MAKEHTML {
    my $SOURCE = $_[0];

    # \&{...} resolves the subroutine by name without a bare symbolic call.
    my $fetch   = \&{$SOURCE};
    my @records = grep { defined($_) && length($_) } $fetch->();
    my $count   = scalar(@records);

    # The section title comes from the first record; if the scraper found
    # nothing, fall back to the scraper's own name instead of a blank title.
    my $src = $SOURCE;
    if (@records) {
        (undef, $src) = split(/\t/, $records[0]);
    }

    print "$src \n";
    print "($count articles) \n";
    print "Last updated $main::timestamp \n";
    print "\n";

    foreach my $record (reverse @records) {
        my (undef, undef, undef, $title, $link) = split(/\t/, $record);
        $title = "" unless defined $title;
        $link  = "" unless defined $link;
        chomp($link);
        # The closing </a> used to be printed with no opening tag and $link
        # was never used; emit the full anchor.
        print "<a href=\"$link\">$title</a>\n";
    }
    print "\n\n";
}
# YAHOO - scrape top-story headlines from the Yahoo! News index page.
#
# Downloads the page with wget into /tmp/yahoo.<pid>, scans it line by line,
# and deletes the temp file afterwards. A line containing "class=topstory"
# supplies the headline; the line immediately before it supplies the href.
#
# Takes no arguments.
# Returns a list: an empty-string placeholder first (kept because MAKEHTML was
# written against lists that start with it), then one record per story in page
# order, each "num\tsource\tsourceurl\theadline\tlink\n". If the page cannot
# be opened, only the placeholder is returned.
sub YAHOO {
    my $source    = "YAHOO";
    my $sourceurl = "http://news.yahoo.com";
    my $pageurl   = "http://news.yahoo.com/news?tmpl=index&cid=1212";
    my $tmpfile   = "/tmp/yahoo.$$";
    my @news      = ("");
    my $articles  = 0;
    my $lastline  = "";

    # List-form system() runs wget directly, so the "?" and "&" in the URL
    # never pass through a shell.
    system("wget", "-q", $pageurl, "-O", $tmpfile);

    if (open(my $in, "<", $tmpfile)) {
        while (my $line = <$in>) {
            chomp($line);
            if ($line =~ /class=topstory/) {
                $articles++;
                my $headline = $line;
                $headline =~ s/.*story>//;
                $headline =~ s/<\/a>//;
                $headline =~ s/<\/b>//;
                my $link = $lastline;
                $link =~ s/.*href="(.*)".*/$1/;
                push(@news, "$articles\t$source\t$sourceurl\t$headline\t$link\n");
            }
            $lastline = $line;
        }
        close($in);
    }
    unlink($tmpfile);
    return @news;
}
# WIRED - collect headline records from the wired.com front page.
#
# Runs wget to save http://www.wired.com as /tmp/wired.<pid>, scans that file
# line by line, deletes it, and returns @news: one empty-string element
# followed by one "num\tsource\tsourceurl\theadline\tlink\n" record per
# matching line. Links are passed through URLDecode and prefixed with
# http://www.wired.com.
#
# NOTE(review): several tokens in this sub appear to have been lost (anything
# that looked like markup) -- see the inline notes. As it stands the sub does
# not parse.
sub WIRED {
system("wget -q \"http://www.wired.com\" -O /tmp/wired.$$");
open(IN,"/tmp/wired.$$");
my $gather=0;
my $articles=0;
my $line="";
my $link="";
my $headline="";
# Assigning "" gives the list one empty first element; MAKEHTML's $count--
# compensates for it.
my @news="";
my $source="WIRED";
my $sourceurl="http://www.wired.com";
# NOTE(review): the read expression is missing after "="; presumably a
# readline on the IN handle was intended -- confirm against a clean copy.
while ($line=) {
chomp($line);
# NOTE(review): empty pattern. Presumably this was the marker that starts the
# headline section; the marker text is not recoverable from this file. An empty
# pattern in Perl reuses the last successful match, so this is not a fixed test.
if ($line=~//)
{ $gather=1; }
# NOTE(review): empty pattern again, presumably the end-of-section marker.
# "break" is not a Perl loop-control statement; "last" was presumably meant.
if (($gather==1)&&($line=~//))
{ $gather=0; break; }
# While gathering, a line with an href to an .html page followed by hd"> is
# treated as a headline link.
if (($gather==1)&&($line=~/href=.*html.*hd">/)) {
$articles++;
$headline=$line;
$headline=~s/.*hd">(.*)<\/a.*/$1/;
$link=$line;
$link=~s/.*href='(.*)' class.*/$1/;
# $link2 is not declared with my, so it is a package global.
$link2=&URLDecode($link);
$link2="http://www.wired.com".$link2;
push(@news,"$articles\t$source\t$sourceurl\t$headline\t$link2\n");
}
}
close(IN);
unlink("/tmp/wired.$$");
return @news;
}
# REUTERS - collect headline records from the Reuters internet-news channel.
#
# Runs wget to save the channel page as /tmp/reuters.<pid>, then reads that
# file twice (two separate open/close passes) before deleting it. Returns
# @news: one empty-string element followed by
# "num\tsource\tsourceurl\theadline\tlink\n" records, with links prefixed by
# http://www.reuters.com/.
#
# NOTE(review): the first pass is damaged (see inline notes) and the sub does
# not parse as it stands.
sub REUTERS {
system("wget -q \"http://www.reuters.com/newsChannel.jhtml?type=internetNews\" -O /tmp/reuters.$$");
open(IN,"/tmp/reuters.$$");
my $gather=0;
my $articles=0;
my $line="";
my $link="";
my $headline="";
# Assigning "" gives the list one empty first element; MAKEHTML's $count--
# compensates for it.
my @news="";
my $source="REUTERS";
my $sourceurl="http://www.reuters.com";
# First pass.
# NOTE(review): the read expression is missing after "="; presumably a
# readline on the IN handle was intended.
while ($line=) {
chomp($line);
# NOTE(review): the next line looks like an "if" condition fused with the tail
# of a substitution. Compared with the second pass below, the match marker,
# "$articles++", "$headline=$line" and the start of the headline substitution
# are absent. The lost marker cannot be recovered from this file.
if ($line=~/(.*)<\/a.*/$1/;
$link=$line;
$link=~s/.*href="(.*)" target.*/$1/;
$link="http://www.reuters.com/".$link;
push(@news,"$articles\t$source\t$sourceurl\t$headline\t$link\n");
}
}
close(IN);
# Second pass over the same file: lines containing "alt_sideHeadline".
open(IN,"/tmp/reuters.$$");
# NOTE(review): read expression missing here as well.
while ($line=) {
chomp($line);
if ($line=~/alt_sideHeadline/) {
$articles++;
$headline=$line;
# Headline is the text between the last "> and the closing </a on the line.
$headline=~s/.*">(.*)<\/a.*/$1/;
$link=$line;
$link=~s/.*href="(.*)" target.*/$1/;
$link="http://www.reuters.com/".$link;
push(@news,"$articles\t$source\t$sourceurl\t$headline\t$link\n");
}
}
close(IN);
unlink("/tmp/reuters.$$");
return @news;
}
# CNET1 - collect headline records from the news.com.com front page.
#
# Runs wget to save http://news.com.com as /tmp/cnet1.<pid>, scans the file
# line by line, deletes it, and returns @news: one empty-string element
# followed by "num\tsource\tsourceurl\theadline\tlink\n" records. Only
# headlines longer than 4 characters are recorded; links are passed through
# URLDecode and prefixed with http://news.com.com.
#
# NOTE(review): several patterns in this sub appear to have lost their markup
# (see inline notes); the sub does not parse as it stands.
sub CNET1 {
system("wget -q \"http://news.com.com\" -O /tmp/cnet1.$$");
open(IN,"/tmp/cnet1.$$");
my $gather=0;
my $articles=0;
my $line="";
my $link="";
my $headline="";
# Assigning "" gives the list one empty first element; MAKEHTML's $count--
# compensates for it.
my @news="";
my $source="CNET";
my $sourceurl="http://news.com.com";
# NOTE(review): the read expression is missing after "="; presumably a
# readline on the IN handle was intended.
while ($line=) {
chomp($line);
# NOTE(review): empty pattern; presumably the start-of-section marker, whose
# text is not recoverable from this file.
if ($line=~//)
{ $gather=1; }
# NOTE(review): the pattern below contains only a line break; presumably the
# end-of-section marker was lost. "break" is not a Perl loop-control
# statement; "last" was presumably meant.
if (($gather==1)&&($line=~/
/))
{ $gather=0; break; }
# While gathering, any line with an href to an .html page is a candidate.
if (($gather==1)&&($line=~/href=.*html/)) {
# Note: the counter is bumped before the length filter below, so article
# numbers can skip values.
$articles++;
$headline=$line;
$headline=~s/.*">(.*)<\/a.*/$1/;
# NOTE(review): the next three clean-up substitutions look damaged: the first
# pattern spans a line break, and the second (.*) would blank the entire
# headline, so the length test below could never pass. Originals unknown.
$headline=~s/.*
//;
$headline=~s/.*//;
$headline=~s/ //;
if (length($headline)>4) {
$link=$line;
$link=~s/.*href="(.*?)">.*/$1/;
# $link2 is not declared with my, so it is a package global.
$link2=&URLDecode($link);
$link2="http://news.com.com".$link2;
push(@news,"$articles\t$source\t$sourceurl\t$headline\t$link2\n");
}
}
}
close(IN);
unlink("/tmp/cnet1.$$");
return @news;
}
# CNET2 - collect off-site ("VARIOUS SOURCES") links from the news.com.com
# front page.
#
# Runs wget to save http://news.com.com as /tmp/cnet2.<pid>, scans the file
# line by line, deletes it, and returns @news: one empty-string element
# followed by "num\tsource\tsourceurl\theadline\tlink\n" records. The link is
# taken from the destUrl= query parameter and passed through URLDecode.
#
# NOTE(review): several patterns in this sub appear to have lost their markup
# (see inline notes); the sub does not parse as it stands.
sub CNET2 {
system("wget -q \"http://news.com.com\" -O /tmp/cnet2.$$");
open(IN,"/tmp/cnet2.$$");
my $gather=0;
my $articles=0;
my $line="";
my $link="";
my $headline="";
# Assigning "" gives the list one empty first element; MAKEHTML's $count--
# compensates for it.
my @news="";
my $source="VARIOUS SOURCES";
my $sourceurl="http://www.terrorist.net";
# NOTE(review): the read expression is missing after "="; presumably a
# readline on the IN handle was intended.
while ($line=) {
chomp($line);
# NOTE(review): empty pattern; presumably the start-of-section marker, whose
# text is not recoverable from this file.
if ($line=~//)
{ $gather=1; }
# NOTE(review): empty pattern again, presumably the end-of-section marker.
# "break" is not a Perl loop-control statement; "last" was presumably meant.
if (($gather==1)&&($line=~//))
{ $gather=0; break; }
# While gathering, a line carrying a destUrl=http parameter is a candidate.
if (($gather==1)&&($line=~/destUrl=http/)) {
$articles++;
$headline=$line;
# NOTE(review): presumably an opening tag stood between ".*" and "(.*)";
# as written the capture is always empty because the first .* is greedy.
$headline=~s/.*(.*)<\/b>.*/$1/;
$link=$line;
#
# Change contributed by Kelso
# to fix the format change at CNET. 4/23/04
#
#$link=~s/.*destUrl=(.*)\&edId.*/$1/;
$link=~s/.*&destUrl=(.*)">.*/$1/;
# $link2 is not declared with my, so it is a package global.
$link2="";
$link2=&URLDecode($link);
push(@news,"$articles\t$source\t$sourceurl\t$headline\t$link2\n");
}
}
close(IN);
unlink("/tmp/cnet2.$$");
return @news;
}
# SLASHDOT - collect headline records from the Slashdot RDF feed.
#
# Downloads http://slashdot.org/slashdot.rdf with wget into
# /tmp/slashdot.<pid>, scans it line by line, and deletes the temp file.
# Inside each <item> ... </item> span the most recent <title> and <link>
# values are remembered; the record is emitted when the closing tag is seen.
#
# The opening-tag patterns (<item, <title>, <link>) are reconstructed from
# the closing tags that survived in the substitutions -- confirm against a
# clean copy of the script if one turns up.
#
# Takes no arguments.
# Returns a list: an empty-string placeholder first (kept because MAKEHTML was
# written against lists that start with it), then one
# "num\tsource\tsourceurl\theadline\tlink\n" record per item in feed order.
# If the feed cannot be opened, only the placeholder is returned.
sub SLASHDOT {
    my $source    = "SLASHDOT";
    my $sourceurl = "http://www.slashdot.org";
    my $tmpfile   = "/tmp/slashdot.$$";
    my @news      = ("");
    my $articles  = 0;
    my $initem    = 0;
    my $headline  = "";
    my $link      = "";

    # List-form system() runs wget directly without involving a shell.
    system("wget", "-q", "http://slashdot.org/slashdot.rdf", "-O", $tmpfile);

    if (open(my $in, "<", $tmpfile)) {
        while (my $line = <$in>) {
            chomp($line);
            if ($line =~ /<item/i) {
                # Start of an item: forget values left over from the channel
                # header or from the previous item.
                $initem   = 1;
                $headline = "";
                $link     = "";
            }
            next unless $initem;
            if ($line =~ /<title>/i) {
                $headline = $line;
                $headline =~ s/.*<title>(.*)<\/title>.*/$1/i;
            }
            if ($line =~ /<link>/i) {
                $link = $line;
                $link =~ s/.*<link>(.*)<\/link>.*/$1/i;
            }
            if ($line =~ /<\/item/i) {
                $articles++;
                push(@news, "$articles\t$source\t$sourceurl\t$headline\t$link\n");
                $initem = 0;
            }
        }
        close($in);
    }
    unlink($tmpfile);
    return @news;
}
# URLDecode - undo URL (percent) encoding.
#
# Parameter: $_[0]  the encoded string.
# Returns:   a copy with "+" turned into a space, every %XX hex escape turned
#            into the corresponding character, and any HTML comment
#            (<!-- ... -->) removed.
#
# The final substitution previously had an empty pattern. In Perl an empty
# pattern reuses the last successful regex, so it silently deleted any %XX
# sequence produced by decoding (e.g. "%2541" came back empty). The explicit
# comment-stripping pattern below is presumably what was there originally.
sub URLDecode {
    my $theURL = $_[0];
    $theURL =~ tr/+/ /;
    $theURL =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/eg;
    # /s lets "." cross newlines, so multi-line comments are removed too.
    $theURL =~ s/<!--.*-->//gs;
    return $theURL;
}
# URLEncode - percent-encode a string.
#
# Parameter: the text to encode (first argument).
# Returns:   a copy in which every non-word character (anything matching \W)
#            is replaced by "%" plus its character code as at least two
#            upper-case hex digits. Letters, digits and "_" pass through.
sub URLEncode {
    my ($text) = @_;
    $text =~ s/(\W)/sprintf("%%%02X", ord($1))/eg;
    return $text;
}