#!/usr/bin/perl
#
# Date:        12.07.2003
# Author:      Tony Watson
# Email:       paw@paw.org
# Homepage:    www.terrorist.net
# Title:       News Crawler v2.0
# Description: Gather Internet & Tech articles from various sources.  Modular
#              in design so I can add or modify new sites without much effort.
#              This is a re-write of the original
#              "securitycrawler/newscrawler" scripts I wrote back in 1999
#              which I lost the source code to...
#
# No includes or require modules!!!  Simplicity is so wonderful.
#
# How it fits together:
#   * Each source sub (SLASHDOT, WIRED, REUTERS, YAHOO, CNET1, CNET2) downloads
#     one page with wget and returns a list of records, one per article:
#         "<n>\t<source name>\t<source url>\t<headline>\t<link>\n"
#   * MAKEHTML(<sub name>) calls the named source sub and prints that source's
#     section on STDOUT.
#
# NOTE(review): the copy of this file that was reviewed had lost its line
# structure and most text that looked like an HTML tag or comment (including
# the <IN> readline operators and parts of several regexes and print strings).
# Everything that could be recovered with confidence has been restored; the
# places where text is still missing are marked NOTE(review) below and must be
# checked against a clean copy of the script.

use strict;
use warnings;

# Timestamp shown in every section header.  It is taken once so that all
# sections of one run agree, and it keeps the trailing newline that `date`
# prints.  Falls back to Perl's own clock if /bin/date cannot be run.
our $timestamp = `/bin/date`;
$timestamp = scalar(localtime) . "\n"
    unless defined $timestamp && length $timestamp;

# Run every source in the fixed page order.
sub main {
    for my $name (qw(SLASHDOT WIRED REUTERS YAHOO CNET1 CNET2)) {
        MAKEHTML($name);
    }
    return;
}

# MAKEHTML($SOURCE)
#   $SOURCE - name of a source sub defined in this file (e.g. "SLASHDOT").
# Calls that sub and prints the section for it: source name, article count,
# timestamp, then one line per article.  Articles are printed last-to-first,
# which is the order the original pop()-driven loop produced.
# Dies if $SOURCE does not name a defined sub.
sub MAKEHTML {
    my $SOURCE = $_[0];

    die "MAKEHTML: no source name given\n"
        unless defined $SOURCE && length $SOURCE;

    # Look the sub up by name without creating a stub for unknown names.
    my $fetch = do {
        no strict 'refs';
        defined &{$SOURCE} ? \&{$SOURCE} : undef;
    };
    die "MAKEHTML: unknown source '$SOURCE'\n" unless $fetch;

    # Source subs used to return a leading "" placeholder, which forced the
    # caller to subtract one from the count and read the header from element
    # 1.  Dropping empty entries here gives the right count for both the old
    # and the current return shape.
    my @records = grep { defined && length } $fetch->();
    my $count   = @records;

    # The display name comes from the first record; with no articles there is
    # no record to read it from, so fall back to the sub name.
    my $src = @records ? (split /\t/, $records[0])[1] : undef;
    $src = $SOURCE unless defined $src && length $src;

    # NOTE(review): the header strings below are reproduced as found.  The
    # record's source URL is never printed, so the original markup around
    # these lines was most likely stripped - confirm against a clean copy.
    print "$src \n";
    print "($count articles) \n";
    print "Last updated $timestamp\n";
    print "\n";

    for my $record (reverse @records) {
        my (undef, undef, undef, $title, $link) = split /\t/, $record;
        $title = '' unless defined $title;
        $link  = '' unless defined $link;
        chomp $link;

        #$link=~s/http:\/\///;
        #$byte1=pack("S1",1);
        #$link="http://www.terrorist.net".$byte1."%00@".$link;

        # NOTE(review): only the closing </a> survived in the reviewed copy
        # and $link was otherwise unused, so the opening anchor is restored
        # here.  The bullet and spacing are kept exactly as found.
        print "
  • <a href=\"$link\">$title</a>\n";
    }

    print "

    \n";
    return;
}

# _fetch_lines($url) - download $url with wget and return its lines, chomped.
# wget writes to a pipe instead of a predictable /tmp/<name>.$$ file, and it
# is started without a shell.  Failures are reported on STDERR and whatever
# was read (possibly nothing) is returned, so one dead site does not stop the
# remaining sections from being generated.
sub _fetch_lines {
    my ($url) = @_;

    my $pid = open(my $fh, '-|', 'wget', '-q', '-O', '-', $url);
    unless ($pid) {
        warn "newscrawler: cannot run wget for $url: $!\n";
        return;
    }

    my @lines = <$fh>;
    chomp @lines;

    unless (close $fh) {
        my $why = $! ? "error closing pipe: $!" : "exit status " . ($? >> 8);
        warn "newscrawler: wget $url: $why\n";
    }
    return @lines;
}

# _record(...) - build one tab-separated article record (format in the header).
sub _record {
    my ($number, $source, $sourceurl, $headline, $link) = @_;
    return "$number\t$source\t$sourceurl\t$headline\t$link\n";
}

# _section_lines($start, $end, @lines) - return the lines from the first one
# matching $start up to, but not including, the next one matching $end.
# The original loops used `break` to stop at the end marker; that is not a
# Perl loop control (as a bareword it does nothing), so `last` is used here.
sub _section_lines {
    my ($start, $end, @lines) = @_;

    my @section;
    my $inside = 0;
    for my $line (@lines) {
        $inside = 1 if $line =~ $start;
        next unless $inside;
        last if $line =~ $end;
        push @section, $line;
    }
    return @section;
}

# YAHOO - top stories from the Yahoo news index page.
# Returns the list of article records.
sub YAHOO {
    return _parse_yahoo(
        _fetch_lines('http://news.yahoo.com/news?tmpl=index&cid=1212'));
}

# A headline line contains "class=topstory"; its link is taken from the href
# on the line immediately before it.
sub _parse_yahoo {
    my @lines = @_;

    my @news;
    my $articles = 0;
    my $lastline = "";
    for my $line (@lines) {
        if ($line =~ /class=topstory/) {
            $articles++;
            my $headline = $line;
            $headline =~ s/.*story>//;
            $headline =~ s/<\/a>//;
            $headline =~ s/<\/b>//;
            (my $link = $lastline) =~ s/.*href="(.*)".*/$1/;
            push @news, _record($articles, "YAHOO", "http://news.yahoo.com",
                                $headline, $link);
        }
        $lastline = $line;
    }
    return @news;
}

# WIRED - headlines from the wired.com front page.
# Returns the list of article records.
sub WIRED {
    return _parse_wired(_fetch_lines('http://www.wired.com'));
}

sub _parse_wired {
    my @lines = @_;

    # NOTE(review): both section markers are empty patterns in the reviewed
    # copy (their text was stripped).  They are written as (?:) so they stay
    # plain always-matching patterns instead of triggering Perl's "reuse the
    # last successful pattern" rule for a literally empty regex.  With both
    # matching every line the section is empty and no articles are returned -
    # restore the real start/end markers to re-enable this source.
    my $section_start = qr/(?:)/;
    my $section_end   = qr/(?:)/;

    my @news;
    my $articles = 0;
    for my $line (_section_lines($section_start, $section_end, @lines)) {
        next unless $line =~ /href=.*html.*hd">/;
        $articles++;
        (my $headline = $line) =~ s/.*hd">(.*)<\/a.*/$1/;
        (my $link     = $line) =~ s/.*href='(.*)' class.*/$1/;
        push @news, _record($articles, "WIRED", "http://www.wired.com",
                            $headline,
                            "http://www.wired.com" . URLDecode($link));
    }
    return @news;
}

# REUTERS - internet news headlines from reuters.com.
# Returns the list of article records.
sub REUTERS {
    return _parse_reuters(_fetch_lines(
        'http://www.reuters.com/newsChannel.jhtml?type=internetNews'));
}

sub _parse_reuters {
    my @lines = @_;

    # The page is scanned once per marker, in order, so that all matches of
    # an earlier marker are listed before those of a later one.
    # NOTE(review): the original made two passes.  The condition of the first
    # pass was stripped from the reviewed copy and cannot be reconstructed;
    # only the second pass (alt_sideHeadline) is recoverable.  Add the missing
    # marker in front of this one once it is known.
    my @headline_markers = (qr/alt_sideHeadline/);

    my @news;
    my $articles = 0;
    for my $marker (@headline_markers) {
        for my $line (@lines) {
            next unless $line =~ $marker;
            $articles++;
            (my $headline = $line) =~ s/.*">(.*)<\/a.*/$1/;
            (my $link     = $line) =~ s/.*href="(.*)" target.*/$1/;
            push @news, _record($articles, "REUTERS", "http://www.reuters.com",
                                $headline, "http://www.reuters.com/" . $link);
        }
    }
    return @news;
}

# CNET1 - CNET's own headlines from news.com.com.
# Returns the list of article records.
sub CNET1 {
    return _parse_cnet1(_fetch_lines('http://news.com.com'));
}

sub _parse_cnet1 {
    my @lines = @_;

    # NOTE(review): in the reviewed copy the section markers and the last
    # headline clean-up consist only of newlines and spaces, and two other
    # clean-ups are bare ".*" deletions - evidently what was left after tag
    # text was stripped.  They are reproduced as found.  In this form the
    # start marker cannot match a chomped line, so no articles are returned;
    # restore the real patterns to re-enable this source.
    my $section_start = qr/\n\n    /;
    my $section_end   = qr/\n    /;

    my @news;
    my $articles = 0;
    for my $line (_section_lines($section_start, $section_end, @lines)) {
        next unless $line =~ /href=.*html/;

        # Counted before the length check, as in the original, so article
        # numbers can skip when a candidate line is rejected.
        $articles++;
        (my $headline = $line) =~ s/.*">(.*)<\/a.*/$1/;
        $headline =~ s/.*//;
        $headline =~ s/.*//;
        $headline =~ s/\n    //;
        next unless length($headline) > 4;

        (my $link = $line) =~ s/.*href="(.*?)">.*/$1/;
        push @news, _record($articles, "CNET", "http://news.com.com",
                            $headline,
                            "http://news.com.com" . URLDecode($link));
    }
    return @news;
}

# CNET2 - links to other sites' stories listed on news.com.com.
# Returns the list of article records.
sub CNET2 {
    return _parse_cnet2(_fetch_lines('http://news.com.com'));
}

sub _parse_cnet2 {
    my @lines = @_;

    # NOTE(review): both section markers are empty patterns in the reviewed
    # copy (their text was stripped); written as (?:) for the same reason as
    # in _parse_wired.  No articles are returned until the real markers are
    # restored.
    my $section_start = qr/(?:)/;
    my $section_end   = qr/(?:)/;

    my @news;
    my $articles = 0;
    for my $line (_section_lines($section_start, $section_end, @lines)) {
        next unless $line =~ /destUrl=http/;
        $articles++;

        # NOTE(review): the opening <b> was missing in the reviewed copy,
        # which made the capture always empty; restored to pair with the
        # surviving </b>.
        (my $headline = $line) =~ s/.*<b>(.*)<\/b>.*/$1/;

        #
        # Change contributed by Kelso
        # to fix the format change at CNET.  4/23/04
        #
        #$link=~s/.*destUrl=(.*)\&edId.*/$1/;
        (my $link = $line) =~ s/.*&destUrl=(.*)">.*/$1/;

        push @news, _record($articles, "VARIOUS SOURCES",
                            "http://www.terrorist.net",
                            $headline, URLDecode($link));
    }
    return @news;
}

# SLASHDOT - stories from the Slashdot RDF feed.
# Returns the list of article records.
sub SLASHDOT {
    return _parse_slashdot(_fetch_lines('http://slashdot.org/slashdot.rdf'));
}

# The most recent <title> and <link> seen are remembered and a record is
# emitted at each closing </item, so titles and links that appear outside an
# item are overwritten before they can be used.  The original also set an
# $initem flag at the start of each item but never read it; it is dropped.
sub _parse_slashdot {
    my @lines = @_;

    my @news;
    my $articles = 0;
    my $headline = "";
    my $link     = "";
    for my $line (@lines) {
        # NOTE(review): the text of this condition was stripped from the
        # reviewed copy; <title> is restored to match the substitution that
        # follows it and the parallel <link> test.
        if ($line =~ /<title>/i) {
            ($headline = $line) =~ s/.*<title>(.*)<\/title>.*/$1/;
        }
        if ($line =~ /<link>/i) {
            ($link = $line) =~ s/.*<link>(.*)<\/link>.*/$1/;
        }
        if ($line =~ /<\/item/i) {
            $articles++;
            push @news, _record($articles, "SLASHDOT",
                                "http://www.slashdot.org", $headline, $link);
        }
    }
    return @news;
}

# URLDecode($url) - turn "+" into a space, decode %XX escapes, and remove
# everything from the first "<!--" to the last "-->".  Returns the decoded
# string ("" for an undefined argument).
sub URLDecode {
    my $theURL = $_[0];
    $theURL = '' unless defined $theURL;
    $theURL =~ tr/+/ /;
    $theURL =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/eg;
    $theURL =~ s/<!--(?:.|\n)*-->//g;
    return $theURL;
}

# URLEncode($url) - replace every non-word character with its %XX escape
# (upper-case hex).  Returns the encoded string ("" for an undefined
# argument).
sub URLEncode {
    my $theURL = $_[0];
    $theURL = '' unless defined $theURL;
    $theURL =~ s/(\W)/sprintf("%%%02X", ord($1))/eg;
    return $theURL;
}

# Run the crawl when executed as a script; do nothing when the file is loaded
# with require/do, so the subs above can be used on their own.
main() unless caller();

1;