從hao123提取鏈接,分析響應報文頭,確定是用哪種伺服器[轉]
轉自
---------- Forwarded message ----------
From: 陳學芹 <robbiecn at gmail.com>
Date: 2007-8-12 下午4:53
Subject: 用Perl統計hao123網址之家首頁的網站WEB伺服器種類
***************
SERVER RAW DATA
***************
888.89178.com : Apache/2.2.0 (Unix) DAV/2 PHP/5.2.1
91.shgao.com : Apache
adfarm.mediaplex.com: Apache-Coyote/1.1
allyesbjafa.allyes.com:
auto.sina.com.cn: Apache/2.0.58 (Unix)
baby.sina.com.cn: Apache/2.0.58 (Unix)
baike.baidu.com: apache 1.0.4.0
blog.sina.com.cn: Nginx/0.5.30
cang.baidu.com : apache 1.0.7.1
china.nba.com : Apache/1.3.34 (Debian) mod_layout/3.2.1
chinahrafaad.allyes.com: Microsoft-IIS/6.0
club.sohu.com : Apache/2.0.55 (Unix) PHP/5.1.6
cn.mail.yahoo.com:
cn.msn.com : Microsoft-IIS/6.0
dict.baidu.com : Apache/2.0.58 (Unix) PHP/4.4.2
download.zol.com.cn: Apache
dzh.mop.com : lighttpd
eladies.sina.com.cn: Apache/2.0.58 (Unix)
file.baidu.com : Apache/2.0.58 (Unix) PHP/4.4.2
finance.sina.com.cn: Apache/2.0.59 (Unix)
flights.ctrip.com: Microsoft-IIS/6.0
fund.eastmoney.com: Microsoft-IIS/6.0
games.sina.com.cn: Apache/2.0.54 (Unix)
geci.baidu.com : apache 1.6.6.0/httpd 1.3.27 (Unix) BAIDU_UENCODE
v1.0.0 mod_gzip/1.3.19.1a mod_image/2.0.1 mod_cache/1.0.0
mod_baidu/4.1.1 mod_baidussa/1.0.0
guba.eastmoney.com: Microsoft-IIS/6.0
hd.www.net.cn : Apache/1.3.26 (Unix) PHP/4.2.2
health.sohu.com: Apache/1.3.37 (Unix) mod_gzip/1.3.26.1a
hi.baidu.com : apache 1.1.16.0
hjsm.tom.com : Apache/1.3.34 (Debian) PHP/5.1.4-0.1
image.baidu.com: apache 1.7.7.0/httpd 1.3.27 (Unix) BAIDU_UENCODE
v1.0.0 mod_gzip/1.3.19.1a mod_image/2.0.1 mod_cache/1.0.0
mod_baidu/4.1.1
junshi.xilu.com: Apache/2.0.59 (Unix) PHP/4.3.10
lady.163.com : Apache/2.0.59 (Unix)
lady.qq.com : Apache
lady.tom.com : Apache/2.2.0 (Unix) DAV/2 PHP/5.1.2
login.mail.sohu.com: Apache/1.3.33 (Unix) Resin/2.0.5 PHP/4.4.1
love21cn.msn.com.cn: Apache
ma.baidu.com : Apache/2.0.52 (Red Hat)
mail.163.com : Apache
mail.sina.com.cn: Apache/2.2.4 (FreeBSD) PHP/5.2.1 with Suhosin-Patch
mail.tom.com : Apache/1.3.31 (Unix)
map.baidu.com : apache 1.2.3.2/httpd 1.3.27 (Unix) mod_gis/1.0.0
mod_xslt/1.0.0 mod_mapurl/1.0.0 mod_gzip/1.3.19.1a mod_cache/1.0.0
mod_baidu/4.1.1 mod_ipcheck/1.0.0
mil.news.sina.com.cn: Apache/2.0.58 (Unix)
military.china.com: Apache
mmscode2.5kcn.com:
mobile.pconline.com.cn: Apache/2.2.3 (Unix) PHP/4.4.5
mobile.pcpop.com: Microsoft-IIS/6.0
mobile.zol.com.cn: Apache
mp3.baidu.com : apache 1.6.6.0/httpd 1.3.27 (Unix) BAIDU_UENCODE
v1.0.0 mod_gzip/1.3.19.1a mod_image/2.0.1 mod_cache/1.0.0
mod_baidu/4.1.1 mod_baidussa/1.0.0
my.51job.com : Apache/1.3.37 (Unix)
news.baidu.com : apache2.0.16.0/1.3.27 (Unix) MOD_NEWSREWRITE v1.0.0
mod_ipcheck/1.0.0 mod_gzip/1.3.19.1a mod_cache/1.0.0 mod_baidu/4.1.1
BAIDU_IMAGE v1.0.2
news.phoenixtv.com: Apache/2.2.3 (Unix)
news.sina.com.cn: Apache/2.0.58 (Unix)
news.sohu.com : Apache/1.3.37 (Unix) mod_gzip/1.3.26.1a
post.baidu.com : apache 2.7.2.0/httpd 1.3.27 (Unix) mod_forum/1.0.0
mod_gzip/1.3.19.1a mod_baidu/4.1.1
quanshiafa.allyes.com: Server
quote.eastmoney.com: Microsoft-IIS/5.0
qzone.qq.com : Apache
service.12530.com: Apache
spaces.live.com: Microsoft-IIS/6.0
spcode.baidu.com: Apache-Coyote/1.1
sports.cctv.com: Sun-ONE-Web-Server/6.1
sports.sina.com.cn: Apache/2.0.59 (Unix)
sports.sohu.com: Apache/1.3.37 (Unix) mod_gzip/1.3.26.1a
sports.tom.com : Apache/2.2.0 (Unix) DAV/2 PHP/5.1.2
tech.sina.com.cn: Apache/2.0.59 (Unix)
tj.28.com : Apache/2.0.54 (Unix) DAV/2 PHP/4.3.6
top.baidu.com : Apache/1.3.29 (Unix) PHP/4.3.4
u.7town.com : Microsoft-IIS/6.0
video.baidu.com: apache 1.0.5.0/httpd 1.3.27 (Unix)
mod_gzip/1.3.19.1a mod_cache/1.0.0 mod_tn/1.0.0 mod_video/1.0.0
mod_ipcheck/1.0.0
weather.tq121.com.cn: Apache/2.0.54 (Unix) PHP/5.0.4
www.126.com : Apache
www.155.com : Apache/2.0.54 (Win32)
www.163.com : Apache/2.0.59 (Unix)
www.17173.com : Apache/2.0.54 (Unix)
www.1860ls.com : Microsoft-IIS/6.0
www.1ting.com : Apache/2.2.3 (Unix) mod_jk/1.2.19
www.21cn.com :
www.3158.cn : Microsoft-IIS/6.0
www.3533.com : Microsoft-IIS/5.0
www.3839.com : Apache
www.39.net : Microsoft-IIS/6.0
www.4399.net : Microsoft-IIS/6.0
www.51.com : Apache
www.51job.com : Apache/1.3.37 (Unix)
www.5460.net : Apache-Coyote/1.1
www.56.com : web server.56
www.6rooms.com : nginx/0.4.9.dev.2
www.96333.com :
www.abchina.com: IBM_HTTP_Server/2.0.47.1 Apache/2.0.47 (Unix)
www.aiting.com : Microsoft-IIS/6.0
www.amazon.cn : Server
www.autohome.com.cn: Microsoft-IIS/6.0
www.babytree.com: Apache
www.baidu.com : BWS/1.0
www.baihe.com : Apache/2.0.59 (Unix)
www.bankcomm.com: IBM_HTTP_SERVER/1.3.28.1 Apache/1.3.28 (Unix)
www.beijing2008.cn: Apache
www.boc.cn : IBM_HTTP_SERVER/1.3.26 Apache/1.3.26 (Unix)
www.bokee.com : Apache/1.3.31 (Unix) mod_gzip/1.3.26.1a
www.caiacai.com: Apache/2.0.59 (Unix) mod_ssl/2.0.59 OpenSSL/0.9.8d PHP/5.2.3
www.ccb.com : Apache/2.0.58 (Unix)
www.cctv.com : Sun-ONE-Web-Server/6.1
www.china.com : Apache
www.chinacars.com: Microsoft-IIS/6.0
www.chinagames.net: Microsoft-IIS/6.0
www.chinamobile.com: Apache
www.chinanews.com.cn: Apache/1.3.36 (Unix)
www.chinaren.com: Apache/1.3.37 (Unix) mod_gzip/1.3.26.1a
www.cjol.com : Microsoft-IIS/6.0
www.cmbchina.com:
www.cmfu.com : Microsoft-IIS/6.0
www.cnfol.com : Apache
www.crsky.com : Microsoft-IIS/6.0
www.ctrip.com : Microsoft-IIS/6.0
www.dangdang.com: Microsoft-IIS/6.0
www.dianping.com: Microsoft-IIS/6.0
www.disney.com.cn: Apache
www.donews.com : Microsoft-IIS/6.0
www.eachnet.com: Apache/2.2.0 (Linux/SUSE)
www.eastmoney.com: Microsoft-IIS/6.0
www.f130.net : Microsoft-IIS/6.0
www.fh21.com.cn: Apache/1.3.34 (Unix) mod_gzip/1.3.26.1a PHP/4.3.11
www.flash8.net : Microsoft-IIS/6.0
www.flowercn.com: Microsoft-IIS/6.0
www.game.com.cn: lighttpd/1.4.15
www.ganji.com : Apache/2.0.55 (Unix) PHP/5.0.5
www.google.cn : GWS/2.1
www.gov.cn : Apache
www.gznet.com : Apache/2.0.49 (Unix)
www.hao123.com : Apache/2.2.4 (Unix) PHP/5.1.4
www.hd315.gov.cn: Microsoft-IIS/5.0
www.hongxiu.com: Microsoft-IIS/6.0
www.hotmail.com: Microsoft-IIS/6.0
www.hunantv.com: Apache/2.0.54 (Unix) PHP/4.4.1
www.icbc.com.cn: Microsoft-IIS/5.0
www.imobile.com.cn: Apache/1.3.37 (Unix) mod_gzip/1.3.26.1a
www.ip138.com : Microsoft-IIS/6.0
www.jrj.com.cn : Microsoft-IIS/6.0
www.ku6.com : Apache
www.lottery.gov.cn: Microsoft-IIS/6.0
www.love21cn.com: Apache
www.marry5.com : lighttpd/1.5.0
www.mydrivers.com:
www.no5.com.cn : Microsoft-IIS/6.0
www.online.sh.cn: Apache/1.3.26 (Unix) mod_gzip/1.3.19.1a
www.onlinedown.net: Microsoft-IIS/6.0
www.openv.tv : Apache
www.ouou.com : lighttpd/1.4.11
www.pcauto.com.cn: Apache/2.2.3 (Unix) PHP/4.4.5
www.pcgames.com.cn: Apache/2.2.3 (Unix) PHP/4.4.5
www.pconline.com.cn: Apache/2.2.3 (Unix) PHP/4.4.5
www.people.com.cn: Apache/1.3.37 (Unix)
www.phoenixtv.com: Apache/2.2.3 (Unix)
www.qq.com : Apache
www.qq163.com : Microsoft-IIS/6.0
www.qunar.com : Apache/2.2.3 (Unix) mod_jk/1.2.18
www.rayli.com.cn: Apache
www.readnovel.com: Apache/2.2.3 (Debian) PHP/4.4.4-8+etch4
www.reuters.com.cn: Microsoft-IIS/5.0
www.rising.com.cn: Microsoft-IIS/6.0
www.rongshuxia.com: Apache/1.3.37 (Unix)
www.sina.com.cn: Apache/2.0.54 (Unix)
www.skycn.com : Who_knows?
www.sogou.com : Apache/2.0.55 (Unix)
www.sogua.com :
www.sohu.com : Apache/1.3.37 (Unix) mod_gzip/1.3.26.1a
www.sooe.cn : Apache/2.0.59 (Unix) DAV/2 PHP/5.2.1
www.spjoy.com : Microsoft-IIS/5.0
www.stockstar.com: Microsoft-IIS/6.0
www.taobao.com : Apache
www.tianya.cn : Microsoft-IIS/5.0
www.tiexue.net : Microsoft-IIS/6.0
www.tom.com : Apache/1.3.34 (Debian) PHP/5.1.2-1
www.wuhan.net.cn:
www.xcar.com.cn: Apache
www.xiaoyouxi.com: Microsoft-IIS/6.0
www.xinhuanet.com: Apache
www.xxsy.net : Microsoft-IIS/6.0
www.xywy.com : Apache/2.2.3 (Unix) DAV/2 PHP/5.1.6
www.yahoo.cn : Apache
www.yaolan.com :
www.youku.com : Apache
www.younet.com : Apache/1.3.29 (Unix) PHP/4.3.4
www.yymp3.com : Microsoft-IIS/6.0
www.zaobao.com : Apache
www.zhaopin.com: Apache/1.3.37 (Unix)
www.zhcw.com : Apache/2.0.55 (Unix) DAV/2
zhidao.baidu.com: apache 1.0.10.0
***************
SERVER STAT
***************
Apache : 105 55.85%
IIS : 48 25.53%
GWS : 1 0.53%
Others : 34 18.09%
用 Perl 寫了個分析工具:從 hao123 網站首頁提取出鏈接,然後向每個鏈接所在的伺服器發送 HEAD 請求,分析響應報文頭中的 Server 欄位,確定它用的是哪種伺服器。從數據來看,Apache 還是 Web 伺服器的首選。
程序見附件 webanalyse.pl,以 GPL 方式發布。
[ 本帖最後由 3645636 於 2008-6-25 11:18 編輯 ]
《解決方案》
#!/usr/bin/perl
# Author: Chen Xueqin <robbiecn@gmail.com>
# Create Day: 2006-09-07 V 0.1
# License: GPL
# ChangeLog:
# V0.2 Use array to specify loop sequence
#
require 5.002;
use LWP::Simple;
use Chart::Pie;
use GD;
use URI;
use HTML::LinkExtor;
use Data::Dump qw(dump);
#use strict;
# Server families that are reported. do_analyse tries them in this order and
# stops at the first pattern that matches, so the catch-all must stay last.
my @types = qw(Apache IIS GWS Others);

# Regex source (kept as a plain string) tested against the Server header for
# each family. 'Others' matches any value, including the empty string.
my %types_pattern = (
    Apache => 'Apache',
    IIS    => '.+IIS.+',
    GWS    => 'GWS',
    Others => '.+|^$',
);

# Hit counter per family, plus the grand total kept under the key 'ALL'.
my %stats = map { ($_ => 0) } (@types, 'ALL');
# Hand-picked label => URL table. No code visible in this file reads it.
my %sample_sites = (
    sina        => 'http://www.sina.com.cn',
    google      => 'http://www.google.com',
    apache      => 'http://www.apache.org',
    kernel      => 'http://www.kernel.org',
    yahoo       => 'http://www.yahoo.com.cn',
    hotmail     => 'http://www.hotmail.com',
    zaobao      => 'http://www.zaobao.com',
    linux       => 'http://www.linux.com',
    sohu        => 'http://www.sohu.com',
    chinaren    => 'http://www.chinaren.com',
    whitehouse  => 'http://www.whitehouse.org',
    china       => 'http://www.china.com',
    chinatelcom => 'http://www.chinatelecom.com.cn',
    csdn        => 'http://www.csdn.net',
    'star-net'  => 'http://www.star-net.cn',
    linuxtoday  => 'http://www.linuxtoday.com',
    gentoo      => 'http://www.gentoo.org',
    microsoft   => 'http://www.microsoft.com',
    linuxforum  => 'http://www.linuxforum.net',
    gmail       => 'http://www.gmail.com',
    fzedu       => 'http://www.fzu.edu.cn',
    amazon      => 'http://www.amazon.com',
    slashdot    => 'http://slashdot.org',
    ebay        => 'http://www.ebay.com',
    alibaba     => 'http://china.alibaba.com',
    chinaunix   => 'http://www.chinaunix.net',
    smtel       => 'http://218.67.79.7',
    fzbm        => 'http://www.fzbm.com',
);

# Site name => Server header value; filled in by do_analyse and printed by
# output_raw_data.
my %site_server = ();
# EXTRACT LINKS
# Return the authority component of a URL string, as reported by
# URI's authority() accessor.
#
#   $uri - URL text to parse.
sub get_sitename {
    my ($uri) = @_;
    return URI->new($uri)->authority;
}
# Download the page at $source and collect the absolute links found in its
# <a> tags.
#
# Parameters:
#   $source - URL of the page to download.
#
# Returns:
#   A hash reference mapping each link's authority (see get_sitename) to one
#   URL seen for that authority; when several links share an authority the
#   last one encountered wins. The hash is empty when the page could not be
#   downloaded.
sub extract_link {
    my $source = shift;
    my %sites  = ();

    # LWP::Simple::get returns undef when the download fails.
    my $html = get($source);
    return \%sites unless defined $html;

    my $ext = HTML::LinkExtor->new;
    $ext->parse($html);
    $ext->eof;    # flush whatever the parser is still buffering

    # Every element returned by links() is an array reference of the form
    # [ $tag, $attr_name => $url, ... ], so it has to be unpacked before the
    # tag name or the URL can be inspected.
    foreach my $link ($ext->links) {
        my ($tag, %attr) = @$link;
        next unless $tag eq 'a';

        my $url = $attr{href};
        next unless defined $url && $url =~ /^http/;

        my $authority = get_sitename($url);
        next unless defined $authority;    # e.g. a URL with no host part
        $sites{$authority} = $url;
    }
    return \%sites;
}
# ANALYSE
# Send a HEAD request to every site, record its Server header and classify it
# into one of the families listed in @types.
#
# Parameters:
#   $sites - hash reference mapping site name => URL to probe.
#
# Side effects:
#   - prints one "Analysing <site>" progress line per site;
#   - stores the Server header (empty string when unavailable) in
#     %site_server;
#   - increments the matching family counter in %stats and refreshes
#     $stats{ALL} with the sum over all families.
sub do_analyse {
    # NOTE: $sites is a hash reference, not a plain hash.
    my $sites = shift;

    foreach my $site_name (keys %$sites) {
        printf("Analysing %s\n", $site_name);

        # Elements of a hash reference are read as ${$href}{key}, not $h{key}.
        # In list context head() yields (content type, document length,
        # modified time, expires, server); only the last field is needed.
        my (undef, undef, undef, undef, $server) = head(${$sites}{$site_name});

        # A failed request or a missing Server header leaves $server
        # undefined; treat it as an empty value so it is classified by the
        # catch-all pattern instead of being matched as undef.
        $server = '' unless defined $server;
        $site_server{$site_name} = $server;

        # First matching family wins; @types fixes the priority order.
        foreach my $type (@types) {
            if ($server =~ /$types_pattern{$type}/) {
                $stats{$type} += 1;
                last;
            }
        }
    }

    # The family counters are cumulative, so the total is recomputed from
    # scratch; adding onto the previous total would double-count earlier
    # sites if this sub is called more than once.
    my $total = 0;
    foreach my $type (@types) {
        $total += $stats{$type};
    }
    $stats{'ALL'} = $total;
    return;
}
=pod
sub do_analyse {
foreach my $site_name (keys %sites) {
printf("Analysing %s\n", $site_name);
my($content_type, $document_length, $modified_time, $expires, $server) = head($sites{$site_name});
$site_server{$site_name} = $server;
foreach my $type (@types) {
if ( $server =~ /$types_pattern{$type}/ ) {
$stats{$type} += 1;
last;
}
}
}
#$stats{'Others'} = $stats{'ALL'};
foreach my $type (@types) {
if ( $type =~ /ALL/ ) {
next;
}
$stats{'ALL'} += $stats{$type};
}
}
=cut
# REPORT
# Print the "SERVER RAW DATA" section: one "<site>: <server header>" line per
# entry of %site_server, ordered by site name.
sub output_raw_data {
    report_title("SERVER RAW DATA");
    my @site_names = sort keys %site_server;
    printf("%-15s: %s\n", $_, $site_server{$_}) for @site_names;
}
# Print the "SERVER STAT" section: for every family in @types, its hit count
# and its share of $stats{ALL} as a percentage.
sub output_stat {
    report_title("SERVER STAT");
    my $all = $stats{'ALL'};
    foreach my $server (@types) {
        my $s = $stats{$server};
        # When nothing was analysed the total is 0; report 0% instead of
        # dying with "Illegal division by zero".
        my $p = $all ? 100 * $s / $all : 0;
        printf("%-15s: %-5d %5.2f%%\n", $server, $s, $p);
    }
}
# Print a section banner on STDOUT: a rule of 15 asterisks, the title, and a
# second rule of 15 asterisks, each on its own line.
#
# Parameters:
#   $title - text shown between the two rules.
sub report_title {
    my $title = shift;
    my $rule  = '*' x 15;
    # The earlier printf("%c", '*') loop numified '*' to 0 and therefore wrote
    # 20 NUL bytes in front of the closing rule; only the visible asterisks
    # are emitted now.
    printf("%s\n%s\n%s\n", $rule, $title, $rule);
}
# Render the per-family counters from %stats as a 500x450 pie chart and write
# it to wspie.png in the current directory.
sub output_pie {
    my $chart = Chart::Pie->new(500,450);

    # First dataset: the slice labels; second dataset: the matching counts.
    $chart->add_dataset(@types);
    my @counts = map { $stats{$_} } @types;
    $chart->add_dataset(@counts);

    # Chart options, applied one at a time in this order.
    my @options = (
        [ 'title'               => 'Webserver Graphics Stat' ],
        [ 'sub_title'           => 'Apache&IIS&MISC' ],
        [ 'label_values'        => 'percent' ],
        [ 'legend_label_values' => 'value' ],
        [ 'legend'              => 'bottom' ],
        [ 'grey_background'     => 'false' ],
        [ 'x_label'             => '' ],
        [ 'legend_font'         => gdSmallFont ],
        [ 'title_font'          => gdGiantFont ],
    );
    $chart->set(@$_) for @options;

    $chart->png("wspie.png");
}
# START HERE
# Collect the links on the hao123 front page, probe each site, then emit the
# raw data listing, the summary table and the pie chart.
my $front_page_sites = extract_link("http://www.hao123.com");
do_analyse($front_page_sites);
output_raw_data();
output_stat();
output_pie();