# get_news_for_topic($topic)
#
# Fetches the Google News (Taiwan edition) search page for $topic and scrapes
# the story blocks out of the returned HTML.
#
# Parameters:
#   $topic - search term placed in the `q` query parameter.  Characters that
#            would change the structure of the query string are
#            percent-encoded; '%' and '+' are passed through untouched so a
#            topic that the caller already encoded keeps working.
#
# Returns:
#   - nothing (bare return) when the HTTP request is not successful;
#   - otherwise a reference to a hash keyed by a running integer (0, 1, ...),
#     each value being a hash ref with the keys "href", "title", "date", "st".
#     A field that was not found for a story is undef.
#
# The file must have LWP::UserAgent and HTML::TreeBuilder::XPath loaded.
sub get_news_for_topic {
    my ($topic) = @_;

    # Build the query value as UTF-8 bytes, then percent-encode everything
    # that is not an RFC 3986 unreserved character, so that '&', '#', '=',
    # spaces, etc. inside the topic cannot alter the request.
    my $query = defined $topic ? $topic : '';
    utf8::encode($query) if utf8::is_utf8($query);
    $query =~ s/([^A-Za-z0-9\-_.~%+])/sprintf('%%%02X', ord $1)/ge;

    my $url = "http://news.google.com.tw/news?hl=zh-TW&ned=ttw&q=$query";

    my $ua = LWP::UserAgent->new();
    $ua->agent('Mozilla/5.0');

    my $response = $ua->get($url);
    return unless $response->is_success;

    # Prefer the charset-decoded body; fall back to the raw body when
    # decoded_content yields undef.
    my $content = $response->decoded_content;
    $content = $response->content if (not defined $content);
    #$content = encode('utf-8', decode('big5', $content));

    # Drop the red <font> markup so it does not split the text that the
    # patterns below capture with [^<.]*.
    $content =~ s/<font color=\"\#CC0033\">//g;
    $content =~ s/<\/font>//g;

    my $tree = HTML::TreeBuilder::XPath->new;
    $tree->parse($content);
    $tree->eof;
    #$tree->dump;

    # Serialise every <div> whose class attribute matches /story/ and walk the
    # resulting text line by line.
    my $items = $tree->findnodes_as_string('//div[@class[.=~/story/]]');

    my %result;
    my $id = 0;
    my ($href, $title, $date);

    for my $line (split /\n/, $items) {
        if ($line =~ m/href=\"(\S+)\"/) {
            $href = $1;
        }
        if ($line =~ m/target=\"_blank\">([^<.]*)<\/a><\/h2>/) {
            $title = $1;
        }
        # NOTE: this pattern contains an invisible character right after `">`
        # (it appears to be U+200E LEFT-TO-RIGHT MARK); it is part of the match.
        if ($line =~ m/span class=\"date \">‎([^<.]*)<\/span><\/div>/) {
            $date = $1;
        }

        # The snippet is the last field of a story: only a line that carries
        # one completes a record.
        next unless $line =~ m/class=\"snippet\">([^<.]*)<b>\.\.\.<\/b>/;
        my $snippet = $1;

        $result{$id} = {
            "href"  => $href,
            "title" => $title,
            "date"  => $date,
            "st"    => $snippet,
        };
        $id++;

        # Start the next story from a clean slate so that (a) the lines that
        # follow do not store the same record again and (b) a story with a
        # missing field does not inherit the previous story's value.
        ($href, $title, $date) = ();
    }

    $tree->delete;
    return \%result;
}
結果
date::2010年4月21日
title::富比世全球2000大企業 台灣39家入榜 鴻海排名最高
st:: 排名最高的是第176 名的電子代工龍頭鴻海(2317-TW),其次分別為台積電(2330-TW)、台塑(1301-TW)、中鋼(2002-TW)、中華電信(2412-TW) 與友達(2409-TW)等。 特別的是,總排名第810 名的宏碁(2353-TW),同時也列名130 家「全球成長表現優異」(Global High Performer) 排行榜
href::http://news.cnyes.com/Content/20100422/KC8Q1MUMP84VU.shtml?c=tw_stock
sample code download
http://sites.google.com/site/funningboy/perl_code/GoogleNews.pl?attredirects=0&d=1
# NOTE(review): the statements below repeat the body of get_news_for_topic,
# but no `sub` line opens them in this span, so the final `}` has no visible
# opener here.  Confirm whether this copy is intentional.
#
# Build the Google News (Taiwan edition) search URL from the first argument.
my $topic = $_[0];
my $url = "http://news.google.com.tw/news?hl=zh-TW&ned=ttw&q=$topic";
# Request the page with a browser-like User-Agent string.
my $ua = LWP::UserAgent->new();
$ua->agent('Mozilla/5.0');
my $response = $ua->get($url);
# Give up (bare return) when the request did not succeed.
return unless $response->is_success;
# Prefer the decoded body; fall back to the raw body when that is undef.
my $content = $response->decoded_content;
$content = $response->content if (not defined $content);
#$content = encode('utf-8', decode('big5', $content));
# Strip the red <font> markup before parsing.
$content =~ s/<font color=\"\#CC0033\">//g;
$content =~ s/<\/font>//g;
my $tree = HTML::TreeBuilder::XPath->new;
$tree->parse($content);
$tree->eof;
#$tree->dump;
# Serialise every <div> whose class attribute matches /story/, then split the
# text into lines.
my $items=$tree->findnodes_as_string( '//div[@class[.=~/story/]]');
my @class = split("\n",$items);
# print $class[1];
# $stu records which field matched most recently: 1 = href, 2 = title,
# 3 = date, 4 = snippet.
my $stu;
my ($href,$title,$date,$st,$ID);
my %result;
$ID =0;
for( my $i=0; $i <= $#class; $i++){
if($class[$i] =~ m/href=\"(\S+)\"/ ){ $href = $1; $stu=1; }
if($class[$i] =~ m/target=\"_blank\">([^<.]*)<\/a><\/h2>/ ){ $title = $1; $stu=2; }
# The next pattern contains an invisible character right after `">`.
if($class[$i] =~ m/span class=\"date \">‎([^<.]*)<\/span><\/div>/){ $date = $1; $stu=3; }
if($class[$i] =~ m/class=\"snippet\">([^<.]*)<b>\.\.\.<\/b>/ ){ $st = $1; $stu=4; }
# NOTE(review): $stu is not reset after a record is stored, so each following
# line that matches none of the patterns stores the same record again under a
# new $ID.
if($stu==4){
$result{$ID} = {
"href" => $href,
"title" => $title,
"date" => $date,
"st" => $st,
};
$ID++;
}
}
$tree->delete;
# Result: hash ref keyed by running integer; values hold href/title/date/st.
return \%result;
}
沒有留言:
張貼留言