A Python crawler that scrapes the Chinaz (站长之家, ip.chinaz.com) IP database. It is single-threaded and meant purely as practice: the IP space holds about 4.3 billion records, so crawling it this way would take years at the very least. This is rough beginner code, so please bear with me.
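To see why "years" is no exaggeration, here is a quick back-of-envelope calculation; the request rate is just an assumption for illustration, not a measured number:

# Rough estimate of single-threaded crawl time for the whole IPv4 space.
# The 10-requests-per-second rate is an assumption for illustration.
total_ips = 2 ** 32                       # ~4.3 billion addresses
rate = 10.0                               # requests per second (assumption)
years = total_ips / rate / (3600 * 24 * 365)
print '%.1f years' % years                # roughly 13.6 years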
#!/usr/bin/python
# coding=UTF-8
import urllib2
import re
import csv
import codecs

# Pretend to be an old IE browser so the site serves the page normally.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}

def gethtml(url):
    # Fetch the raw HTML of a Chinaz IP-lookup result page.
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    html = response.read()
    return html

def getdate(html):
    # Extract the server-location strings from the page.
    # NOTE: the HTML tags around this capture group were apparently stripped
    # when the post was published; the pattern needs Chinaz's actual markup.
    rs = re.compile(r' (.*?)\s* ', re.S)
    iplace = rs.findall(html)
    return iplace

def getdataid(html):
    # Extract the numeric (decimal) form of the IP address.
    # NOTE: same as above, the surrounding HTML in the pattern was lost.
    rs = re.compile(r' (\d+?)')
    dataid = rs.findall(html)
    return dataid

def geturl(ip):
    url = 'http://ip.chinaz.com/%s' % ip
    return url

count = 1
with open('ipku.csv', 'a+') as csvfile:
    # Write a UTF-8 BOM so Excel displays the Chinese headers correctly.
    csvfile.write(codecs.BOM_UTF8)
    spamwriter = csv.writer(csvfile, dialect='excel')
    # Columns: numeric address, IP address, server location.
    spamwriter.writerow(['数字地址', 'IP地址', '服务器地址'])
    # Walk part of the IPv4 space one address at a time.
    for k in xrange(211, 256):
        for v in xrange(0, 256):
            for m in xrange(10, 256):
                for n in xrange(0, 256):
                    ip = str(k) + '.' + str(v) + '.' + str(m) + '.' + str(n)
                    url = geturl(ip)
                    html = gethtml(url)
                    dataid = getdataid(html)
                    iplace = getdate(html)
                    for h in dataid:
                        print count  # progress indicator only
                        count += 1
                        spamwriter.writerow([h, ip, iplace[1]])
The count is printed only so you can see that the process is still running; it has no other significance.
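Since the bottleneck is issuing one request at a time, a thread pool is the obvious next step. Below is a minimal sketch, not part of the original script: it assumes the gethtml/getdataid/getdate/geturl functions above, and both the pool size and the sample address range are arbitrary choices for demonstration.

# A minimal multi-threaded sketch (assumption: run in the same file as the
# functions above). Pool size and IP range are arbitrary demo choices.
from multiprocessing.dummy import Pool  # thread pool from the Python 2 stdlib

def crawl(ip):
    html = gethtml(geturl(ip))
    return ip, getdataid(html), getdate(html)

pool = Pool(20)  # 20 worker threads (assumption)
ips = ['211.0.10.%d' % n for n in xrange(256)]  # one /24 block as a demo
for ip, dataid, iplace in pool.map(crawl, ips):
    print ip, dataid
pool.close()
pool.join()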