#!/usr/bin/perl
#
# Date:     12.07.2003
# Author:   Tony Watson 
# Email:    paw@paw.org
# Homepage: www.terrorist.net
# Title:    News Crawler v2.0
# Description:	Gather Internet & Tech articles from various sources.  Modular in design so 
#		I can add or modify new sites without much effort.  This is a re-write of
#		the original "securitycrawler/newscrawler" scripts I wrote back in 1999 which
#		I lost the source code to...
#		
#		No includes or require modules!!! Simplicity is so wonderful.
#
# Timestamp shown in every section header: the raw output of date(1),
# trailing newline included.
$timestamp=`/bin/date`;

# Emit one HTML section per news source, in this order.  The fetcher names
# are passed as quoted strings because MAKEHTML looks the sub up by name; an
# unquoted bareword only happens to work while the sub has not been declared
# yet, and would turn into a direct call as soon as it is.
foreach my $fetcher ('SLASHDOT', 'WIRED', 'REUTERS', 'YAHOO', 'CNET1', 'CNET2') {
  MAKEHTML($fetcher);
}

# MAKEHTML - print one HTML section (header plus link list) for a news source.
#
# Argument:
#   $_[0]  name of a fetcher sub (for example "WIRED").  The sub is called
#          with no arguments and must return a list of records of the form
#          "N\tSOURCE\tSOURCEURL\tTITLE\tLINK\n".  Blank entries, such as an
#          empty placeholder at the front of the list, are ignored.
#
# Output is printed to the currently selected filehandle.  The header shows
# the SOURCE field of the first record (or the fetcher name when no records
# came back), the number of records and the global $timestamp.  The articles
# are listed last-to-first, i.e. in reverse of the order they were returned.
# The return value is not meaningful.
sub MAKEHTML {
  my $SOURCE = $_[0];

  # Resolve the fetcher by name; \&{"name"} is allowed even under strict refs.
  my $fetch = \&{$SOURCE};

  # Keep only real records, so the count and the header do not depend on
  # whether the fetcher put an empty placeholder in front of its list.
  my @articles = grep { defined($_) && /\S/ } $fetch->();
  my $count = @articles;

  # Label the section with the SOURCE field of the first record; fall back to
  # the fetcher name so a failed download does not produce a blank heading.
  my $src = $SOURCE;
  if ($count) {
    my @fields = split(/\t/,$articles[0]);
    $src = $fields[1] if defined($fields[1]);
  }

  print "<font face=\"Arial\" size=4>$src </font>\n";
  print "<font face=\"Arial\" size=2>($count articles) </font>\n";
  print "<font face=\"Arial Black\" size=1>Last updated $timestamp</font>\n";
  print "<font face=\"Arial Narrow\" size=3>\n";

  foreach my $record (reverse(@articles)) {
    # Only the title and the link are used for the list entry.
    my (undef,undef,undef,$title,$link) = split(/\t/,$record);
    $title = "" unless defined($title);
    $link = "" unless defined($link);
    chomp($link);	# records end in a newline
    print "<li><a href=\"$link\">$title</a>\n";
  }
  print "</font><p>\n";
}
  
  

# YAHOO - scrape top stories from a Yahoo! News index page.
#
# Takes no arguments.  Downloads the page with wget and, for every line
# containing "class=topstory", takes the headline from that line and the
# link from the href on the line before it.
#
# Returns a list whose first element is an empty placeholder string that
# callers are expected to skip, followed by one record per story:
#   "N\tYAHOO\thttp://news.yahoo.com\tHEADLINE\tLINK\n"
# If wget cannot be started, a warning is printed and only the placeholder
# is returned.
sub YAHOO {
  my $articles=0;
  my $lastline="";
  my @news="";	# leading placeholder element, kept for the caller
  my $source="YAHOO";
  my $sourceurl="http://news.yahoo.com";

  # Read wget's output through a pipe.  The list form of open bypasses the
  # shell and avoids a predictable temporary file in /tmp.
  my $in;
  unless (open($in,"-|","wget","-q","-O","-","http://news.yahoo.com/news?tmpl=index&cid=1212")) {
    warn "YAHOO: cannot run wget: $!\n";
    return @news;
  }
  while (my $line=<$in>) {
    chomp($line);
    if ($line=~/class=topstory/) {
      $articles++;
      my $headline=$line;
      $headline=~s/.*story>//;
      $headline=~s/<\/a>//;
      $headline=~s/<\/b>//;
      # The anchor tag carrying the URL sits on the previous line.
      my $link=$lastline;
      $link=~s/.*href="(.*)".*/$1/;
      push(@news,"$articles\t$source\t$sourceurl\t$headline\t$link\n");
      }
    $lastline=$line;
    }
  close($in);
  return @news;
}

# WIRED - scrape story links from the wired.com front page.
#
# Takes no arguments.  Downloads http://www.wired.com with wget and scans
# the lines from the "<!-- BEGIN Main Stories -->" marker up to the
# "<!-- END Story List -->" marker for story anchors.  Links are passed
# through URLDecode and prefixed with the site URL.
#
# Returns a list whose first element is an empty placeholder string that
# callers are expected to skip, followed by one record per story:
#   "N\tWIRED\thttp://www.wired.com\tHEADLINE\tLINK\n"
# If wget cannot be started, a warning is printed and only the placeholder
# is returned.
sub WIRED {
  my $gather=0;
  my $articles=0;
  my @news="";	# leading placeholder element, kept for the caller
  my $source="WIRED";
  my $sourceurl="http://www.wired.com";

  # Read wget's output through a pipe.  The list form of open bypasses the
  # shell and avoids a predictable temporary file in /tmp.
  my $in;
  unless (open($in,"-|","wget","-q","-O","-","http://www.wired.com")) {
    warn "WIRED: cannot run wget: $!\n";
    return @news;
  }
  while (my $line=<$in>) {
    chomp($line);
    if ($line=~/<!-- BEGIN Main Stories -->/)
      { $gather=1; }
    next unless ($gather==1);
    # Stop reading at the end marker.  Perl's loop exit is "last"; the
    # "break" used here before was a no-op bareword.
    last if ($line=~/<!-- END Story List -->/);
    if ($line=~/href=.*html.*hd">/) {
      $articles++;
      my $headline=$line;
      $headline=~s/.*hd">(.*)<\/a.*/$1/;
      my $link=$line;
      $link=~s/.*href='(.*)' class.*/$1/;
      # hrefs are site-relative and URL-encoded
      my $link2="http://www.wired.com".URLDecode($link);
      push(@news,"$articles\t$source\t$sourceurl\t$headline\t$link2\n");
      }
    }
  close($in);
  return @news;
}

# REUTERS - scrape the Reuters internet-news channel page.
#
# Takes no arguments.  Downloads the page with wget, then makes two passes
# over it: first the anchors with class "newsHeadlineLarge", then the lines
# containing "alt_sideHeadline".  Records are numbered continuously across
# both passes, and every link is prefixed with "http://www.reuters.com/".
#
# Returns a list whose first element is an empty placeholder string that
# callers are expected to skip, followed by one record per headline:
#   "N\tREUTERS\thttp://www.reuters.com\tHEADLINE\tLINK\n"
# If wget cannot be started, a warning is printed and only the placeholder
# is returned.
sub REUTERS {
  my $articles=0;
  my @news="";	# leading placeholder element, kept for the caller
  my $source="REUTERS";
  my $sourceurl="http://www.reuters.com";

  # Read wget's output through a pipe.  The list form of open bypasses the
  # shell and avoids a predictable temporary file in /tmp.
  my $in;
  unless (open($in,"-|","wget","-q","-O","-","http://www.reuters.com/newsChannel.jhtml?type=internetNews")) {
    warn "REUTERS: cannot run wget: $!\n";
    return @news;
  }
  # Keep the page in memory: it is scanned twice.
  my @page=<$in>;
  close($in);
  chomp(@page);

  # Pass 1: the main headlines.
  foreach my $line (@page) {
    next unless ($line=~/<a class="newsHeadlineLarge"/);
    $articles++;
    my $headline=$line;
    $headline=~s/.*>(.*)<\/a.*/$1/;
    my $link=$line;
    $link=~s/.*href="(.*)" target.*/$1/;
    $link="http://www.reuters.com/".$link;
    push(@news,"$articles\t$source\t$sourceurl\t$headline\t$link\n");
    }

  # Pass 2: the side headlines, appended after all main headlines.
  foreach my $line (@page) {
    next unless ($line=~/alt_sideHeadline/);
    $articles++;
    my $headline=$line;
    $headline=~s/.*">(.*)<\/a.*/$1/;
    my $link=$line;
    $link=~s/.*href="(.*)" target.*/$1/;
    $link="http://www.reuters.com/".$link;
    push(@news,"$articles\t$source\t$sourceurl\t$headline\t$link\n");
    }
  return @news;
}

# CNET1 - scrape the top headlines from the news.com.com front page.
#
# Takes no arguments.  Downloads http://news.com.com with wget and scans the
# lines from "<DIV id=todaystopheadlines>" up to "<DIV id=recentheadlines>"
# for links to .html pages.  Image tags and "<br />" are stripped from the
# headline, and candidates whose remaining headline is 4 characters or
# shorter are dropped.  Links are passed through URLDecode and prefixed with
# the site URL.
#
# Returns a list whose first element is an empty placeholder string that
# callers are expected to skip, followed by one record per headline:
#   "N\tCNET\thttp://news.com.com\tHEADLINE\tLINK\n"
# N counts every candidate line, so numbers can skip when one is dropped.
# If wget cannot be started, a warning is printed and only the placeholder
# is returned.
sub CNET1 {
  my $gather=0;
  my $articles=0;
  my @news="";	# leading placeholder element, kept for the caller
  my $source="CNET";
  my $sourceurl="http://news.com.com";

  # Read wget's output through a pipe.  The list form of open bypasses the
  # shell and avoids a predictable temporary file in /tmp.
  my $in;
  unless (open($in,"-|","wget","-q","-O","-","http://news.com.com")) {
    warn "CNET1: cannot run wget: $!\n";
    return @news;
  }
  while (my $line=<$in>) {
    chomp($line);
    if ($line=~/<DIV id=todaystopheadlines>/)
      { $gather=1; }
    next unless ($gather==1);
    # Stop reading at the next section.  Perl's loop exit is "last"; the
    # "break" used here before was a no-op bareword.
    last if ($line=~/<DIV id=recentheadlines>/);
    if ($line=~/href=.*html/) {
      $articles++;
      my $headline=$line;
      $headline=~s/.*">(.*)<\/a.*/$1/;
      $headline=~s/.*<IMG.*>//;
      $headline=~s/.*<img.*>//;
      $headline=~s/<br \/>//;
      if (length($headline)>4) {
        my $link=$line;
        $link=~s/.*href="(.*?)">.*/$1/;
        # hrefs are site-relative and URL-encoded
        my $link2="http://news.com.com".URLDecode($link);
        push(@news,"$articles\t$source\t$sourceurl\t$headline\t$link2\n");
        }
      }
  }
  close($in);
  return @news;
}

# CNET2 - scrape the "More News from Around the Web" box on news.com.com.
#
# Takes no arguments.  Downloads http://news.com.com with wget and scans the
# lines between the "<!-- More News from Around the Web -->" marker and its
# closing "<!-- /More News from Around the Web -->" marker for redirect
# links carrying a destUrl parameter.  The headline is the text inside
# <b>...</b>; the link is the URL-decoded destUrl value.
#
# Returns a list whose first element is an empty placeholder string that
# callers are expected to skip, followed by one record per headline:
#   "N\tVARIOUS SOURCES\thttp://www.terrorist.net\tHEADLINE\tLINK\n"
# If wget cannot be started, a warning is printed and only the placeholder
# is returned.
sub CNET2 {
  my $gather=0;
  my $articles=0;
  my @news="";	# leading placeholder element, kept for the caller
  my $source="VARIOUS SOURCES";
  my $sourceurl="http://www.terrorist.net";

  # Read wget's output through a pipe.  The list form of open bypasses the
  # shell and avoids a predictable temporary file in /tmp.
  my $in;
  unless (open($in,"-|","wget","-q","-O","-","http://news.com.com")) {
    warn "CNET2: cannot run wget: $!\n";
    return @news;
  }
  while (my $line=<$in>) {
    chomp($line);
    if ($line=~/<!-- More News from Around the Web -->/)
      { $gather=1; }
    next unless ($gather==1);
    # Stop reading at the closing marker.  Perl's loop exit is "last"; the
    # "break" used here before was a no-op bareword.
    last if ($line=~/<!-- \/More News from Around the Web -->/);
    if ($line=~/destUrl=http/) {
      $articles++;
      my $headline=$line;
      $headline=~s/.*<b>(.*)<\/b>.*/$1/;
      my $link=$line;
      #
      # Change contributed by Kelso <dkkelso@yahoo.com>
      # to fix the format change at CNET.  4/23/04
      #
      #$link=~s/.*destUrl=(.*)\&edId.*/$1/;
      $link=~s/.*&destUrl=(.*)">.*/$1/;
      my $link2=URLDecode($link);
      push(@news,"$articles\t$source\t$sourceurl\t$headline\t$link2\n");
      }
  }
  close($in);
  return @news;
}

# SLASHDOT - read the Slashdot RDF feed.
#
# Takes no arguments.  Downloads http://slashdot.org/slashdot.rdf with wget
# and collects the <title> and <link> of every <item> element.  Titles and
# links outside an item (for example those of the channel itself) are
# ignored, and both values are reset at the start of each item so that an
# item lacking one of them does not inherit it from the previous item.
#
# Returns a list whose first element is an empty placeholder string that
# callers are expected to skip, followed by one record per item:
#   "N\tSLASHDOT\thttp://www.slashdot.org\tHEADLINE\tLINK\n"
# If wget cannot be started, a warning is printed and only the placeholder
# is returned.
sub SLASHDOT {
  my $articles=0;
  my @news="";	# leading placeholder element, kept for the caller
  my $source="SLASHDOT";
  my $sourceurl="http://www.slashdot.org";
  my $initem=0;
  my $headline="";
  my $link="";

  # Read wget's output through a pipe.  The list form of open bypasses the
  # shell and avoids a predictable temporary file in /tmp.
  my $in;
  unless (open($in,"-|","wget","-q","-O","-","http://slashdot.org/slashdot.rdf")) {
    warn "SLASHDOT: cannot run wget: $!\n";
    return @news;
  }
  while (my $line=<$in>) {
    chomp($line);
    # \b accepts "<item>" and "<item attr=...>" but not "<items>".
    if ($line=~/<item\b/i) { $initem=1; $headline=""; $link=""; }
    next unless $initem;
    if ($line=~/<title>/i) { $headline=$line; $headline=~s/.*<title>(.*)<\/title>.*/$1/; }
    if ($line=~/<link>/i) { $link=$line; $link=~s/.*<link>(.*)<\/link>.*/$1/; }
    if ($line=~/<\/item/i) {
      $articles++;
      push(@news,"$articles\t$source\t$sourceurl\t$headline\t$link\n");
      $initem=0;
    }
  }
  close($in);
  return @news;
}

# URLDecode - decode a URL-encoded string.
#
# Argument: $_[0], the encoded text.
# Returns a copy in which every "+" has become a space, every %XX hex escape
# has become the character with that code, and any HTML comments
# (<!-- ... -->) have been removed.
sub URLDecode {
    my $theURL = $_[0];
    $theURL =~ tr/+/ /;
    $theURL =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/eg;
    # Non-greedy, with /s so "." also matches newlines: each comment is
    # removed on its own instead of everything from the first "<!--" to the
    # last "-->", which used to swallow the text between two comments.
    $theURL =~ s/<!--.*?-->//gs;
    return $theURL;
}

# URLEncode - percent-encode a string.
#
# Argument: the text to encode.
# Returns a copy in which every character that is not a word character
# (\w: letters, digits, underscore) is replaced by "%" followed by its
# character code in upper-case hex, at least two digits wide.
sub URLEncode {
   my ($text) = @_;
   $text =~ s/(\W)/sprintf("%%%02X", ord($1))/ge;
   return $text;
}


