#!/usr/bin/python
import threading
from time import ctime,sleep
import pycurl
import urllib2
import sys,os
import StringIO
from lxml import etree
import datetime
import hashlib
# Script start time; used at the bottom of the file to report total run time.
starttime = datetime.datetime.now()
def testf():
c = pycurl.Curl()
c.setopt(pycurl.URL, "http://www.weituanpin.com/")
b = StringIO.StringIO()
c.setopt(pycurl.WRITEFUNCTION, b.write)
c.perform()
html = b.getvalue();
print html;
#sleep(500)
#print b.getvalue()
def urllibget(i, j):
    """Download page *i* (a url) and hand the HTML to show_pach.

    *j* is the worker index assigned by the spawning loop; it is currently
    unused here but kept for interface compatibility.
    """
    response = urllib2.urlopen(i)
    try:
        html = response.read()
    finally:
        response.close()  # response object was never closed before
    show_pach(html, i)
def show_pach(html, url):
    """Parse *html*, collect every <img src=...> and download each image.

    Relative src values are naively resolved by prefixing the page *url*;
    urlparse.urljoin would be more robust, but the original behavior is kept.
    """
    tree = etree.HTML(html)
    root = tree.xpath(u"/html")
    # BUG FIX: the XPath was "http://img[@src]" -- "//" mangled into "http://"
    # by a markdown auto-linker. "//img[@src]" selects all <img> with a src.
    nodes = root[0].xpath("//img[@src]")
    for n in nodes:
        src = n.attrib["src"]
        if src.find("http") == -1:
            # naive relative-url join against the page url
            src = url + src
        download_img(src)
def write_file(html,file):
fsock = open("file/"+file+".txt", "a")
fsock.write(html)
fsock.close()
print file + " is OK\n"
def download_img(url):
response = urllib2.urlopen(url)
html = response.read()
ms=hashlib.md5();
ms.update(url)
ms.hexdigest()
fk = open("file/nimabi"+ms.hexdigest()+".jpg", "wb")
fk.write(html)
print url
fk.close()
# Pool of candidate pages (many deliberate duplicates, presumably for load
# testing). NOTE(review): `a` is never referenced below -- the worker loop
# iterates `c` instead; confirm whether this tuple is still needed.
a = ('http://www.sina.com.cn/',
'http://www.sohu.com/',
'http://www.weituanpin.com/',
'http://www.zhihu.com/',
'http://www.xunlei.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.lxweimin.com/',
'http://www.sina.com.cn/',
'http://www.sohu.com/',
'http://www.163.com/',
'http://www.zhihu.com/',
'http://www.xunlei.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.lxweimin.com/',
'http://www.sina.com.cn/',
'http://www.sohu.com/',
'http://www.163.com/',
'http://www.zhihu.com/',
'http://www.xunlei.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.lxweimin.com/',
'http://www.sina.com.cn/',
'http://www.sohu.com/',
'http://www.163.com/',
'http://www.zhihu.com/',
'http://www.xunlei.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.lxweimin.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.lxweimin.com/',
'http://www.sina.com.cn/',
'http://www.sohu.com/',
'http://www.163.com/',
'http://www.zhihu.com/',
'http://www.xunlei.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.lxweimin.com/',
'http://www.sina.com.cn/',
'http://www.sohu.com/',
'http://www.163.com/',
'http://www.zhihu.com/',
'http://www.xunlei.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.lxweimin.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.lxweimin.com/',
'http://www.sina.com.cn/',
'http://www.sohu.com/',
'http://www.163.com/',
'http://www.zhihu.com/',
'http://www.xunlei.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.lxweimin.com/',
'http://www.sina.com.cn/',
'http://www.sohu.com/',
'http://www.163.com/',
'http://www.zhihu.com/',
'http://www.xunlei.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.lxweimin.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.lxweimin.com/',
'http://www.sina.com.cn/',
'http://www.sohu.com/',
'http://www.163.com/',
'http://www.zhihu.com/',
'http://www.xunlei.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.lxweimin.com/',
'http://www.sina.com.cn/',
'http://www.sohu.com/',
'http://www.163.com/',
'http://www.zhihu.com/',
'http://www.xunlei.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.lxweimin.com/',
'http://www.lxweimin.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.lxweimin.com/',
'http://www.lxweimin.com/',
'http://www.xiachufang.com/',
'http://www.163.com/')
# Work list actually used: two copies of the same site, one thread each.
c = ("http://2mn.tv", "http://2mn.tv")
j = 0
for i in c:
    j = j + 1
    # Fire-and-forget worker; threads are never join()ed, so the timing
    # report at the bottom measures only dispatch time, not completion.
    t = threading.Thread(target=urllibget, args=(i, j,))
    t.start()
    # Safety cap at 3 threads; never reached since `c` has only 2 entries.
    # (Loop-body indentation restored -- it was lost when the file was pasted.)
    if j == 3:
        break
# https via pycurl -- keeps the script from hitting a timeout
def testf_https():
    """Fetch an https 1688.com product page with pycurl and scrape its images."""
    # Hoisted so the same literal is not duplicated in the show_pach_https call.
    url = "https://detail.1688.com/offer/528970869962.html?spm=a312h.7841636.1998813769.d_pic_14.Cm06wt&tracelog=p4p"
    c = pycurl.Curl()
    c.setopt(pycurl.URL, url)
    b = StringIO.StringIO()
    c.setopt(pycurl.WRITEFUNCTION, b.write)
    # WARNING: certificate verification disabled -- vulnerable to MITM.
    # Acceptable for a scraping experiment, never for anything sensitive.
    c.setopt(pycurl.SSL_VERIFYPEER, 0)
    c.setopt(pycurl.SSL_VERIFYHOST, 0)
    c.perform()
    c.close()  # release the curl handle (was leaked before)
    html = b.getvalue()
    show_pach_https(html, url)
def show_pach_https(html,url):
tree=etree.HTML(html)
nodes=tree.xpath(u"/html/body")
nodes=nodes[0].xpath("http://img")
for n in nodes:
src = n.attrib["src"]
if src.find("http") == -1:
src = "http:"+src
else:
src = src
print src
download_img_https(src)
def download_img_https(url):
c = pycurl.Curl()
c.setopt(pycurl.URL, url)
b = StringIO.StringIO()
c.setopt(pycurl.WRITEFUNCTION, b.write)
c.setopt(pycurl.SSL_VERIFYPEER, 0)
c.setopt(pycurl.SSL_VERIFYHOST, 0)
c.perform()
html = b.getvalue()
#print html
ms=hashlib.md5()
ms.update(url)
ms.hexdigest()
fk = open("file/nimabi"+ms.hexdigest()+".jpg", "wb")
fk.write(html)
print url
fk.close()
# Report total wall-clock run time in whole seconds. NOTE(review): worker
# threads are not join()ed, so this measures dispatch time, not completion.
endtime = datetime.datetime.now()
print (endtime - starttime).seconds
# --- Blog-page residue captured when this script was scraped; commented out
# --- so the file parses as Python. Original text preserved below.
# python pycurl lxml threading 抓取數據 (data scraping with pycurl/lxml/threading)
# 最后編輯于 : (last edited on:)
# ?著作權歸作者所有,轉載或內容合作請聯系作者 (copyright belongs to the author)
# 平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。
# 推薦閱讀更多精彩內容 (recommended reading)
# - 目標 多線程數據抓取-58同城轉轉網的二手產品 實作 1. 建立一個項目 新建一個項目58tongcheng1 ...
# - 作者:xlzd 鏈接:https://zhuanlan.zhihu.com/p/20435541 來源:知乎 著作...
# - 閑來無事,看看了Python,發現這東西挺爽的,廢話少說,就是干 準備搭建環境因為是MAC電腦,所以自動安裝了Py...
# - Summer 2015 Report #026 - 05/07/15 Jianjian Wu 1. Plan fo...