The blog resumes regular updates today; thank you all for your attention and support. For the past few days I have been working on an open-source IP proxy pool. As the previous posts explained, proxy IPs are one of the key weapons for getting past anti-crawler mechanisms. A large, stable pool of proxy IPs is enormously useful in crawling work, but from a cost perspective, stable commercial pools are expensive, which is exactly why this project exists: it scrapes the free IPs published by proxy sites (admittedly around 70% of them don't work, but the sheer volume and number of sites make up for it), validates them, stores the working ones in a database, and runs an HTTP server exposing an API that your own crawler programs can call. (My new book 《Python爬蟲開發與項目實戰》 has been published; you can find sample chapters here.)
Enough preamble; let's get to today's topic: a walkthrough of my open-source project, IPProxys.
Here is the project's structure:
api package: implements the HTTP server and exposes the API (GET requests that return JSON data)
data folder: where the database file is stored, along with qqwry.dat (used to look up an ip's geographic location)
db package: wraps the database operations
spider package: the core crawling logic that scrapes proxy ips from the proxy sites
test package: test cases; not part of the running project
util package: utility classes; IPAddress.py looks up an ip's geographic location
validator package: checks whether a proxy ip is actually usable
config.py: configuration (including the parsing rules for each proxy site and the database settings)
Now for the key code.
First, apiServer.py:
#coding:utf-8
'''
Defines the query keywords: count, types, protocol, country, area
'''
import urllib
from config import API_PORT
from db.SQLiteHelper import SqliteHelper
__author__ = 'Xaxdus'
import BaseHTTPServer
import json
import urlparse

# keylist=['count', 'types','protocol','country','area']
class WebRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):

    def do_GET(self):
        """
        Parse the query string and answer with the matching proxies as JSON.
        """
        dict = {}
        parsed_path = urlparse.urlparse(self.path)
        try:
            query = urllib.unquote(parsed_path.query)
            print query
            if query.find('&') != -1:
                params = query.split('&')
                for param in params:
                    dict[param.split('=')[0]] = param.split('=')[1]
            else:
                dict[query.split('=')[0]] = query.split('=')[1]
            str_count = ''
            conditions = []
            for key in dict:
                if key == 'count':
                    str_count = 'LIMIT 0,%s' % dict[key]
                if key == 'country' or key == 'area':
                    conditions.append(key + " LIKE '" + dict[key] + "%'")
                elif key == 'types' or key == 'protocol':
                    conditions.append(key + "=" + dict[key])
            if len(conditions) > 1:
                conditions = ' AND '.join(conditions)
            elif len(conditions) == 1:
                conditions = conditions[0]
            else:
                conditions = '1=1'  # no filters supplied: match everything
            sqlHelper = SqliteHelper()
            result = sqlHelper.select(sqlHelper.tableName, conditions, str_count)
            print result
            data = json.dumps(result)
            self.send_response(200)
            self.end_headers()
            self.wfile.write(data)
        except Exception, e:
            print e
            self.send_response(404)

if __name__ == '__main__':
    server = BaseHTTPServer.HTTPServer(('0.0.0.0', API_PORT), WebRequestHandler)
    server.serve_forever()
As the code shows, the handler parses the query parameters: count (how many proxies to return), types (anonymity: 0 = high anonymity, 1 = transparent), protocol (0 = http, 1 = https), country, and area (province/city). For example, requesting http://127.0.0.1:8000/?count=8&types=0 returns the matching proxies as JSON.
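If you would rather hit the interface from a script than a browser, a minimal client sketch looks like this (it assumes the IPProxys server is already running locally on port 8000):

#coding:utf-8
# Minimal client sketch for the API above; each element of the returned JSON
# list is an (ip, port) pair, matching what SqliteHelper.select() produces.
import urllib2
import json

response = urllib2.urlopen('http://127.0.0.1:8000/?count=8&types=0')
proxies = json.loads(response.read())
for ip, port in proxies:
    print '%s:%s' % (ip, port)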
Next, SQLiteHelper.py (which wraps the sqlite operations):
#coding:utf-8
from config import DB_CONFIG
from db.SqlHelper import SqlHelper
__author__ = 'Xaxdus'
import sqlite3

class SqliteHelper(SqlHelper):

    tableName = 'proxys'

    def __init__(self):
        '''
        Establish the database connection.
        :return:
        '''
        self.database = sqlite3.connect(DB_CONFIG['dbPath'], check_same_thread=False)
        self.cursor = self.database.cursor()
        # create the table if it does not exist yet
        self.createTable()

    def createTable(self):
        self.cursor.execute("create TABLE IF NOT EXISTS %s (id INTEGER PRIMARY KEY ,ip VARCHAR(16) NOT NULL,"
                            "port INTEGER NOT NULL ,types INTEGER NOT NULL ,protocol INTEGER NOT NULL DEFAULT 0,"
                            "country VARCHAR (20) NOT NULL,area VARCHAR (20) NOT NULL,updatetime TimeStamp NOT NULL DEFAULT (datetime('now','localtime')) ,speed DECIMAL(3,2) NOT NULL DEFAULT 100)" % self.tableName)
        self.database.commit()

    def select(self, tableName, condition, count):
        '''
        :param tableName: table name
        :param condition: WHERE clause, already rendered as a string
        :param count: LIMIT clause (may be empty)
        :return:
        '''
        command = 'SELECT DISTINCT ip,port FROM %s WHERE %s ORDER BY speed ASC %s ' % (tableName, condition, count)
        self.cursor.execute(command)
        result = self.cursor.fetchall()
        return result

    def selectAll(self):
        self.cursor.execute('SELECT DISTINCT ip,port FROM %s ORDER BY speed ASC ' % self.tableName)
        result = self.cursor.fetchall()
        return result

    def selectCount(self):
        self.cursor.execute('SELECT COUNT( DISTINCT ip) FROM %s' % self.tableName)
        count = self.cursor.fetchone()
        return count

    def selectOne(self, tableName, condition, value):
        '''
        :param tableName: table name
        :param condition: condition containing ? placeholders
        :param value: the values bound to the placeholders (mainly to prevent SQL injection)
        :return:
        '''
        self.cursor.execute('SELECT DISTINCT ip,port FROM %s WHERE %s ORDER BY speed ASC' % (tableName, condition), value)
        result = self.cursor.fetchone()
        return result

    def update(self, tableName, condition, value):
        self.cursor.execute('UPDATE %s %s' % (tableName, condition), value)
        self.database.commit()

    def delete(self, tableName, condition):
        '''
        :param tableName: table name
        :param condition: condition
        :return:
        '''
        deleCommand = 'DELETE FROM %s WHERE %s' % (tableName, condition)
        self.cursor.execute(deleCommand)
        self.commit()

    def commit(self):
        self.database.commit()

    def insert(self, tableName, value):
        proxy = [value['ip'], value['port'], value['type'], value['protocol'], value['country'], value['area'], value['speed']]
        self.cursor.execute("INSERT INTO %s (ip,port,types,protocol,country,area,speed) VALUES (?,?,?,?,?,?,?)" % tableName,
                            proxy)

    def batch_insert(self, tableName, values):
        for value in values:
            if value != None:
                self.insert(self.tableName, value)
        self.database.commit()

    def close(self):
        self.cursor.close()
        self.database.close()

if __name__ == "__main__":
    s = SqliteHelper()
    print s.selectCount()[0]
    # print s.selectAll()
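As a quick sanity check of the helper on its own, here is a small usage sketch (the proxy record is made up; the dict keys are exactly the ones insert() reads above):

#coding:utf-8
# Standalone usage sketch for SqliteHelper; the proxy values are fabricated.
from db.SQLiteHelper import SqliteHelper

helper = SqliteHelper()
helper.insert(SqliteHelper.tableName, {
    'ip': '1.2.3.4', 'port': 8080, 'type': 0, 'protocol': 0,
    'country': u'中國', 'area': u'北京', 'speed': 100,
})
helper.commit()  # insert() deliberately leaves committing to the caller
print helper.select(SqliteHelper.tableName, 'types=0', 'LIMIT 0,5')
helper.close()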
Next, HtmlPraser.py (which does the HTML parsing). It uses lxml's XPath:
#coding:utf-8
import datetime
from config import QQWRY_PATH, CHINA_AREA
from util.IPAddress import IPAddresss
from util.logger import logger
__author__ = 'Xaxdus'
from lxml import etree

class Html_Parser(object):

    def __init__(self):
        self.ips = IPAddresss(QQWRY_PATH)

    def parse(self, response, parser):
        '''
        :param response: the page content to parse
        :param parser: the parsing rules (type, pattern, element positions)
        :return:
        '''
        if parser['type'] == 'xpath':
            proxylist = []
            root = etree.HTML(response)
            proxys = root.xpath(parser['pattern'])
            for proxy in proxys:
                ip = proxy.xpath(parser['postion']['ip'])[0].text
                port = proxy.xpath(parser['postion']['port'])[0].text
                type = proxy.xpath(parser['postion']['type'])[0].text
                if type.find(u'高匿') != -1:
                    type = 0
                else:
                    type = 1
                protocol = ''
                if len(parser['postion']['protocol']) > 0:
                    protocol = proxy.xpath(parser['postion']['protocol'])[0].text
                    if protocol.lower().find('https') != -1:
                        protocol = 1
                    else:
                        protocol = 0
                else:
                    protocol = 0
                addr = self.ips.getIpAddr(self.ips.str2ip(ip))
                country = ''
                area = ''
                # an address containing u'省' or a known Chinese region means a domestic ip
                if addr.find(u'省') != -1 or self.AuthCountry(addr):
                    country = u'中國'
                    area = addr
                else:
                    country = addr
                    area = ''
                # ip, port, type (0 high anonymity, 1 transparent), protocol (0 http, 1 https), country, area
                proxy = {'ip': ip, 'port': int(port), 'type': int(type), 'protocol': int(protocol), 'country': country, 'area': area, 'speed': 100}
                print proxy
                proxylist.append(proxy)
            return proxylist

    def AuthCountry(self, addr):
        '''
        Decide which country an address belongs to.
        :param addr:
        :return:
        '''
        for area in CHINA_AREA:
            if addr.find(area) != -1:
                return True
        return False
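The parser argument handed to Html_Parser.parse() is just a dict of rules that lives in config.py. The real rules are not reproduced in this post, but a hypothetical entry, matching the keys the code above reads (including the 'postion' spelling it actually uses), would look something like:

# Hypothetical parser rule; only the dict shape is authoritative here,
# the XPath expressions below are invented for illustration.
parser = {
    'type': 'xpath',                       # parse() dispatches on this
    'pattern': "//table[@id='ip_list']/tr[position()>1]",  # one node per proxy row
    'postion': {
        'ip': './td[2]',
        'port': './td[3]',
        'type': './td[5]',        # anonymity column, matched against u'高匿'
        'protocol': './td[6]',    # may be '' for sites without a protocol column
    },
}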
IPAddresss.py locates an ip geographically by reading the 純真 database, qqwry.dat. For details on how the file format is read, see: http://ju.outofmemory.cn/entry/85998 and https://linuxtoy.org/archives/python-ip.html
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import socket
import struct

class IPAddresss:
    def __init__(self, ipdbFile):
        self.ipdb = open(ipdbFile, "rb")
        str = self.ipdb.read(8)
        (self.firstIndex, self.lastIndex) = struct.unpack('II', str)
        self.indexCount = (self.lastIndex - self.firstIndex)/7+1
        # print self.getVersion(), u" total records: %d" % (self.indexCount)

    def getVersion(self):
        s = self.getIpAddr(0xffffff00L)
        return s

    def getAreaAddr(self, offset=0):
        if offset:
            self.ipdb.seek(offset)
        str = self.ipdb.read(1)
        (byte,) = struct.unpack('B', str)
        if byte == 0x01 or byte == 0x02:
            p = self.getLong3()
            if p:
                return self.getString(p)
            else:
                return ""
        else:
            self.ipdb.seek(-1, 1)
            return self.getString(offset)

    def getAddr(self, offset, ip=0):
        self.ipdb.seek(offset + 4)
        countryAddr = ""
        areaAddr = ""
        str = self.ipdb.read(1)
        (byte,) = struct.unpack('B', str)
        if byte == 0x01:
            countryOffset = self.getLong3()
            self.ipdb.seek(countryOffset)
            str = self.ipdb.read(1)
            (b,) = struct.unpack('B', str)
            if b == 0x02:
                countryAddr = self.getString(self.getLong3())
                self.ipdb.seek(countryOffset + 4)
            else:
                countryAddr = self.getString(countryOffset)
            areaAddr = self.getAreaAddr()
        elif byte == 0x02:
            countryAddr = self.getString(self.getLong3())
            areaAddr = self.getAreaAddr(offset + 8)
        else:
            countryAddr = self.getString(offset + 4)
            areaAddr = self.getAreaAddr()
        return countryAddr + " " + areaAddr

    def dump(self, first, last):
        if last > self.indexCount:
            last = self.indexCount
        for index in range(first, last):
            offset = self.firstIndex + index * 7
            self.ipdb.seek(offset)
            buf = self.ipdb.read(7)
            (ip, of1, of2) = struct.unpack("IHB", buf)
            address = self.getAddr(of1 + (of2 << 16))
            # convert GBK to UTF-8
            address = unicode(address, 'gbk').encode("utf-8")
            print "%d\t%s\t%s" % (index, self.ip2str(ip), address)

    def setIpRange(self, index):
        offset = self.firstIndex + index * 7
        self.ipdb.seek(offset)
        buf = self.ipdb.read(7)
        (self.curStartIp, of1, of2) = struct.unpack("IHB", buf)
        self.curEndIpOffset = of1 + (of2 << 16)
        self.ipdb.seek(self.curEndIpOffset)
        buf = self.ipdb.read(4)
        (self.curEndIp,) = struct.unpack("I", buf)

    def getIpAddr(self, ip):
        # binary search over the index area for the record containing ip
        L = 0
        R = self.indexCount - 1
        while L < R - 1:
            M = (L + R) / 2
            self.setIpRange(M)
            if ip == self.curStartIp:
                L = M
                break
            if ip > self.curStartIp:
                L = M
            else:
                R = M
        self.setIpRange(L)
        # version information, 255.255.255.X, ugly but useful
        if ip & 0xffffff00L == 0xffffff00L:
            self.setIpRange(R)
        if self.curStartIp <= ip <= self.curEndIp:
            address = self.getAddr(self.curEndIpOffset)
            # convert GBK to UTF-8
            address = unicode(address, 'gbk')
        else:
            address = u"未找到該IP的地址"
        return address

    def getIpRange(self, ip):
        self.getIpAddr(ip)
        range = self.ip2str(self.curStartIp) + ' - ' \
            + self.ip2str(self.curEndIp)
        return range

    def getString(self, offset=0):
        if offset:
            self.ipdb.seek(offset)
        str = ""
        ch = self.ipdb.read(1)
        (byte,) = struct.unpack('B', ch)
        while byte != 0:
            str += ch
            ch = self.ipdb.read(1)
            (byte,) = struct.unpack('B', ch)
        return str

    def ip2str(self, ip):
        return str(ip >> 24)+'.'+str((ip >> 16) & 0xffL)+'.'+str((ip >> 8) & 0xffL)+'.'+str(ip & 0xffL)

    def str2ip(self, s):
        (ip,) = struct.unpack('I', socket.inet_aton(s))
        return ((ip >> 24) & 0xffL) | ((ip & 0xffL) << 24) | ((ip >> 8) & 0xff00L) | ((ip & 0xff00L) << 8)

    def getLong3(self, offset=0):
        if offset:
            self.ipdb.seek(offset)
        str = self.ipdb.read(3)
        (a, b) = struct.unpack('HB', str)
        return (b << 16) + a
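Using it directly is straightforward; a quick sketch, mirroring exactly how Html_Parser calls it above:

# Quick usage sketch: resolve an ip to its location string, assuming
# qqwry.dat sits in ./data/ as configured in config.py.
from util.IPAddress import IPAddresss

ips = IPAddresss('./data/qqwry.dat')
addr = ips.getIpAddr(ips.str2ip('114.114.114.114'))
print addr.encode('utf-8')  # getIpAddr returns a unicode string decoded from GBK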
最后看一下validator.py,由于使用的是python2.7,所以要使用協程采用了gevent:
#coding:utf-8
from gevent import monkey
monkey.patch_all()  # patch before requests is imported, so its sockets are cooperative

import datetime
from gevent.pool import Pool
import requests
import time
from config import TEST_URL
import config
from db.SQLiteHelper import SqliteHelper
__author__ = 'Xaxdus'

class Validator(object):

    def __init__(self, sqlHelper=None):
        # Note: the original file defined two __init__ methods; in Python the
        # second silently replaces the first, so they are merged here.
        self.detect_pool = Pool(config.THREADNUM)
        self.sqlHelper = sqlHelper

    def run_db(self):
        '''
        Validate the proxies stored in the database.
        :return:
        '''
        try:
            # first delete everything that has expired
            self.deleteOld()
            # then check whether the remaining ips are still usable
            results = self.sqlHelper.selectAll()
            self.detect_pool.map(self.detect_db, results)
            return self.sqlHelper.selectCount()  # return the final count
        except Exception, e:
            print e
            return 0

    def run_list(self, results):
        '''
        Validate freshly crawled proxies before they enter the database,
        dropping the bad ones directly from the list.
        :param results:
        :return:
        '''
        proxys = self.detect_pool.map(self.detect_list, results)
        # at this point proxys has the form [{},{},{},{},{}]
        return proxys

    def deleteOld(self):
        '''
        Delete stale records.
        :return:
        '''
        condition = "updatetime<'%s'" % ((datetime.datetime.now() - datetime.timedelta(minutes=config.MAXTIME)).strftime('%Y-%m-%d %H:%M:%S'))
        self.sqlHelper.delete(SqliteHelper.tableName, condition)

    def detect_db(self, result):
        '''
        :param result: an (ip, port) row from the database
        :return:
        '''
        ip = result[0]
        port = str(result[1])
        proxies = {"http": "http://%s:%s" % (ip, port)}
        start = time.time()
        try:
            r = requests.get(url=TEST_URL, headers=config.HEADER, timeout=config.TIMEOUT, proxies=proxies)
            if not r.ok:
                condition = "ip='" + ip + "' AND " + 'port=' + port
                print 'fail ip =%s' % ip
                self.sqlHelper.delete(SqliteHelper.tableName, condition)
            else:
                speed = round(time.time() - start, 2)
                self.sqlHelper.update(SqliteHelper.tableName, 'SET speed=? WHERE ip=? AND port=?', (speed, ip, port))
                print 'success ip =%s,speed=%s' % (ip, speed)
        except Exception, e:
            condition = "ip='" + ip + "' AND " + 'port=' + port
            print 'fail ip =%s' % ip
            self.sqlHelper.delete(SqliteHelper.tableName, condition)

    def detect_list(self, proxy):
        '''
        :param proxy: a proxy dict
        :return:
        '''
        ip = proxy['ip']
        port = proxy['port']
        proxies = {"http": "http://%s:%s" % (ip, port)}
        start = time.time()
        try:
            r = requests.get(url=TEST_URL, headers=config.HEADER, timeout=config.TIMEOUT, proxies=proxies)
            if not r.ok:
                print 'fail ip =%s' % ip
                proxy = None
            else:
                speed = round(time.time() - start, 2)
                print 'success ip =%s,speed=%s' % (ip, speed)
                proxy['speed'] = speed
        except Exception, e:
            print 'fail ip =%s' % ip
            proxy = None
        return proxy

if __name__ == '__main__':
    # v = Validator()
    # results = [{'ip': '192.168.1.1', 'port': 80}] * 10
    # results = v.run_list(results)
    # print results
    pass
Finally, let's see it run. Switch to the project directory and execute python IPProxys.py from the command line; once the server is up, issuing a request from the browser returns the corresponding result.
The execution flow: every half hour, the validity of the ip addresses in the database is re-checked and invalid proxies are deleted; if the number of remaining ips falls below a threshold, the crawler starts a new round of scraping. Both the check interval and the thresholds are configurable in config.py; a sketch of this loop and the relevant config follow below.
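IPProxys.py itself is not reproduced in this post, but the scheduling just described boils down to a loop like this hypothetical sketch (the Validator import path and the spider hand-off are my assumptions, not the actual file; MINNUM and UPDATE_TIME come from config.py below):

# Hypothetical scheduler sketch, not the actual IPProxys.py.
import time
import config
from db.SQLiteHelper import SqliteHelper
from validator.Validator import Validator  # assumed module path

def maintain_pool():
    sqlHelper = SqliteHelper()
    validator = Validator(sqlHelper)
    while True:
        validator.run_db()  # revalidate stored proxies, deleting dead ones
        count = sqlHelper.selectCount()[0]  # distinct ips that survived
        if count < config.MINNUM:
            pass  # below the threshold: kick off the spider package for a fresh crawl
        time.sleep(config.UPDATE_TIME)  # sleep half an hour between rounds

With that picture in mind, here is the relevant part of config.py: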
import random  # needed by the HEADER definition at the bottom of this excerpt

'''
Database configuration
'''
DB_CONFIG = {
    'dbType': 'sqlite',  # sqlite, mysql, mongodb
    'dbPath': './data/proxy.db',  # only meaningful for sqlite
    'dbUser': '',  # username
    'dbPass': '',  # password
    'dbName': ''  # database name
}
CHINA_AREA=[u'河北',u'山東',u'遼寧',u'黑龍江',u'吉林'
,u'甘肅',u'青海',u'河南',u'江蘇',u'湖北',u'湖南',
u'江西',u'浙江',u'廣東',u'云南',u'福建',
u'臺灣',u'海南',u'山西',u'四川',u'陜西',
u'貴州',u'安徽',u'重慶',u'北京',u'上海',u'天津',u'廣西',u'內蒙',u'西藏',u'新疆',u'寧夏',u'香港',u'澳門']
QQWRY_PATH = "./data/qqwry.dat"
THREADNUM = 20
API_PORT = 8000
'''
Settings for crawling and for validating ips.
There is no need to check whether an ip already exists,
because stale records are cleaned out on a schedule.
'''
UPDATE_TIME = 30*60  # re-check every half hour whether any proxy ips have gone bad
MINNUM = 500  # when the number of valid ips drops below this value, the crawler starts
MAXTIME = 24*60  # maximum age in minutes; records stored longer than this are deleted
TIMEOUT = 5  # socket timeout
'''
Anti-anti-crawler settings
'''
'''
Retry count
'''
RETRY_TIME = 3
'''
USER_AGENTS: a pool of User-Agent strings picked at random
'''
USER_AGENTS = [
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
"Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
"Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
]
HEADER = {
'User-Agent': random.choice(USER_AGENTS),
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Connection': 'keep-alive',
'Accept-Encoding': 'gzip, deflate',
}
TEST_URL='http://www.ip138.com/'
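One design note on HEADER: random.choice runs once, at import time, so every request in a given run reuses the same User-Agent. If you want a fresh one per request, a small hypothetical tweak (not part of the project) would be:

# Hypothetical helper: rebuild the headers per request so the UA rotates.
def make_header():
    header = dict(HEADER)
    header['User-Agent'] = random.choice(USER_AGENTS)
    return header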
The project's code is quite simple; if you want to dig deeper, read through the source of my open-source project IPProxys in detail. The code is still a bit rough, and I will keep polishing it.
The complete code is on GitHub: https://github.com/qiyeboy/IPProxys
qqwry.dat download link: http://pan.baidu.com/s/1o7A6n8m password: wcvs.
That's all for today's share; if you found it worthwhile, remember to leave a tip.