# -*- coding: utf-8 -*-
import re
import csv
import scrapy
import redis
# Module-wide Redis client; the spider below uses it for set membership
# checks and inserts to remember which IDs have already been handled.
redis_cli = redis.Redis(
    host="127.0.0.1",
    port=6379,
)
def _parse_cookie_header(header):
    """Split a raw ``Cookie`` header string into a ``{name: value}`` dict.

    Pairs are separated by ``;``. Whitespace around each pair is stripped so
    names do not keep the blank that follows the separator, and only the first
    ``=`` divides name from value, so a value that itself contains ``=`` is
    kept whole. Empty segments and segments without ``=`` are skipped.
    """
    jar = {}
    for pair in header.split(';'):
        name, sep, value = pair.strip().partition('=')
        if sep and name:
            jar[name] = value
    return jar


class SearchSpider(scrapy.Spider):
    """Look up each ID from a CSV on tianyancha.com and save company details.

    Flow, as implemented below:

    1. ``start_requests`` reads ``needs_csv`` and issues one search request
       per row whose ID (column ``ID_COLUMN``) is not yet in the Redis set
       ``redis_key``.
    2. ``index_parse`` follows the first search hit to the detail page, or
       records the row in ``searchfailed.csv`` when there is no hit.
    3. ``parse_campany`` extracts the detail fields, appends the enriched row
       to ``result2.csv`` and adds the ID to the Redis set.

    ``needs_csv`` is a plain class attribute, so it can be overridden per run
    with a spider argument (``scrapy crawl search -a needs_csv=/path/to.csv``).
    """

    name = 'search'
    allowed_domains = ['www.tianyancha.com']
    start_urls = ['http://www.tianyancha.com/']

    # Input file with one record per row; the lookup key is in ID_COLUMN.
    needs_csv = '/Users/admin/Downloads/tianyancha/tianyancha/needs.csv'
    # Zero-based index of the CSV column holding the ID that is searched for.
    ID_COLUMN = 3
    # Redis set holding every ID that has already been processed.
    redis_key = 'tianyancha'

    # Removes HTML tags from a serialized element, leaving only its text.
    dr = re.compile(r'<[^>]+>', re.S)
    # CSS path of one cell in the "base info" table on the detail page.
    base_info_cell = ('#_container_baseInfo > table.table.-striped-col.-border-top-none'
                      ' > tbody > tr:nth-child({row}) > td:nth-child({col})')

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'Host': 'www.tianyancha.com',
        'Referer': 'www.tianyancha.com',
    }
    # NOTE(review): this string embeds what look like real session credentials
    # (``token``, ``auth_token``, a mobile number). Consider loading it from
    # settings or the environment instead of committing it to source.
    cookies = 'aliyungf_tc=AQAAAP6muX9DyAsAtlH3Oqxg8RT+o/Ue; csrfToken=JgvbpACNlq9x03jd8-YRyQpo; TYCID=4eed97a0de4a11e8a64ebdb872e198de; undefined=4eed97a0de4a11e8a64ebdb872e198de; ssuid=425167980; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1541127139; _ga=GA1.2.1870685403.1541127140; _gid=GA1.2.857435514.1541127140; token=8463eed85217427b92358c2828d546b2; _utm=4fe67a08ebed4970858bf216a50c247d; tyc-user-info=%257B%2522myQuestionCount%2522%253A%25220%2522%252C%2522integrity%2522%253A%25220%2525%2522%252C%2522state%2522%253A%25220%2522%252C%2522vipManager%2522%253A%25220%2522%252C%2522onum%2522%253A%25220%2522%252C%2522monitorUnreadCount%2522%253A%252258%2522%252C%2522discussCommendCount%2522%253A%25220%2522%252C%2522token%2522%253A%2522eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNzYyMTk4OTkyMyIsImlhdCI6MTU0MTEyNzE2NSwiZXhwIjoxNTU2Njc5MTY1fQ.uBNhmJ563KfA6tyJAk-pc54yGYThirDuKDBPStBzHHvAEYJ3gQ4lnDLxugKJzQ0enXXs59uESKjTPXtRQ65LrQ%2522%252C%2522redPoint%2522%253A%25220%2522%252C%2522pleaseAnswerCount%2522%253A%25220%2522%252C%2522vnum%2522%253A%25220%2522%252C%2522bizCardUnread%2522%253A%25220%2522%252C%2522mobile%2522%253A%252217621989923%2522%257D; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNzYyMTk4OTkyMyIsImlhdCI6MTU0MTEyNzE2NSwiZXhwIjoxNTU2Njc5MTY1fQ.uBNhmJ563KfA6tyJAk-pc54yGYThirDuKDBPStBzHHvAEYJ3gQ4lnDLxugKJzQ0enXXs59uESKjTPXtRQ65LrQ; RTYCID=6fd9880e3de246acbff6beaedbc9ec77; CT_TYCID=f41090929ec847f8b63b73608a8cbd0b; bannerFlag=true; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1541131237; _gat_gtag_UA_123487620_1=1; cloud_token=3d67526240bf4f3c9aaff7f004d8e522; cloud_utm=1bd3d1e102e441d39f7560aaef75b87e'
    # Parsed form of ``cookies`` that is handed to every request.
    cookie = _parse_cookie_header(cookies)

    @staticmethod
    def _first(selection):
        """Return the first extracted value of *selection*, or '' if empty."""
        return selection.extract_first(default='')

    def start_requests(self):
        """Yield one search request per CSV row that is not yet in Redis.

        The 1-based row number and the untouched row are passed along in
        ``meta`` (keys ``num`` and ``oldinfo``) for the later callbacks.
        Rows too short to contain the ID column are skipped with a warning
        instead of aborting the whole run.
        """
        # newline='' is what the csv module asks for so that line breaks
        # embedded in quoted fields are handled by the reader itself.
        with open(self.needs_csv, newline='') as source:
            for num, row in enumerate(csv.reader(source), start=1):
                if len(row) <= self.ID_COLUMN:
                    self.logger.warning('row %d has no ID column, skipped: %r', num, row)
                    continue
                company_id = row[self.ID_COLUMN]
                # Skip IDs that were already handled in an earlier run.
                if redis_cli.sismember(self.redis_key, company_id):
                    continue
                print(row)
                yield scrapy.Request(
                    url='https://www.tianyancha.com/search?key=%s' % company_id,
                    callback=self.index_parse,
                    headers=self.headers,
                    cookies=self.cookie,
                    meta={'oldinfo': row, 'num': num},
                )

    def index_parse(self, response):
        """Follow the first search hit, or log the row as a failed search.

        On a hit, yields a request for the detail page handled by
        ``parse_campany``. Without a hit, the ID is added to the Redis set
        (so it is not searched again) and the original row is appended to
        ``searchfailed.csv``.
        """
        oldinfo = response.meta['oldinfo']
        # The trailing blank in "name " is part of the class attribute matched.
        href = response.xpath('//a[@class="name "]/@href').extract_first()
        if href:
            yield scrapy.Request(
                # urljoin keeps absolute links as they are and resolves
                # relative ones against the search page URL.
                url=response.urljoin(href),
                callback=self.parse_campany,
                headers=self.headers,
                cookies=self.cookie,
                # Forward only our own keys, not Scrapy's bookkeeping entries.
                meta={'oldinfo': oldinfo, 'num': response.meta['num']},
            )
            return

        print('提取詳情頁(yè)失敗{}'.format(oldinfo))
        redis_cli.sadd(self.redis_key, oldinfo[self.ID_COLUMN])
        with open('searchfailed.csv', 'a', encoding='utf-8', newline='') as failed:
            csv.writer(failed).writerow(oldinfo)
        print('存入失敗csv')

    def parse_campany(self, response):
        """Extract company details and append the enriched row to result2.csv.

        Every field falls back to '' when its selector matches nothing. The
        output row is the original CSV row followed by, in order: industry,
        registration authority, address, business scope, introduction,
        status, phone, website, registered capital, registration time,
        company type and organization code. Afterwards the ID is added to
        the Redis set.

        NOTE(review): the Chinese labels matched in the XPath expressions
        and the "no info" sentinel were restored from a mis-encoded form of
        this file to plain Simplified Chinese; confirm them against a live
        detail page.
        """
        oldinfo = response.meta['oldinfo']
        first = self._first
        cell = self.base_info_cell

        # Industry
        hy = first(response.css(cell.format(row=3, col=4) + '::text'))
        # Registration authority
        djjj = first(response.css(cell.format(row=6, col=4) + '::text'))
        # Registered address
        zcdz = first(response.css(cell.format(row=8, col=2) + '::text'))

        # Business scope: prefer the dedicated span; when it is absent or
        # empty, fall back to the table cell with its markup stripped.
        jyfw = first(response.xpath('//span[@class="js-full-container"]/text()'))
        if not jyfw:
            jyfw = self.dr.sub('', first(response.css(cell.format(row=9, col=2))))

        # Introduction: the span holds either the "no info" sentinel or a
        # short text; in the latter case the text of the first <script>
        # inside the summary is used instead.
        introduction = first(response.xpath('//div[@class="summary"]/span[2]/text()'))
        if introduction and introduction != '暂无信息':
            introduction = first(
                response.xpath('//div[@class="summary"]/script[1]/text()')
            ).strip()

        # Company status
        status = first(response.xpath(
            '//div[contains(./text(),"公司状态")]/following::div[1]/@title'))
        # Phone
        phone = first(response.xpath(
            '//span[contains(./text(),"电话:")]/following::span[1]/text()'))
        # Website
        site = first(response.xpath(
            '//span[contains(./text(),"网址")]/following::span[1]/text()'))
        # Registered capital
        register_money = first(response.xpath('//tbody/tr[1]/td[2]/div[2]/@title'))
        # Registration time is not scraped; a fixed placeholder is stored.
        register_time = '加密'
        # Company type
        company_type = first(response.xpath(
            '//td[contains(./text(),"公司类型")]/following::td[1]/text()'))
        # Organization code
        company_code = first(response.xpath(
            '//td[contains(./text(),"组织机构代码")]/following::td[1]/text()'))

        # Build a new list rather than appending to the one stored in meta.
        enriched = list(oldinfo) + [
            hy, djjj, zcdz, jyfw, introduction, status, phone, site,
            register_money, register_time, company_type, company_code,
        ]
        with open('result2.csv', 'a', encoding='utf-8', newline='') as results:
            csv.writer(results).writerow(enriched)

        company_id = oldinfo[self.ID_COLUMN]
        print('第{}個(gè)寫(xiě)入完成 ---- {}'.format(response.meta['num'], company_id))
        redis_cli.sadd(self.redis_key, company_id)
# import csv
# all=[]
# with open('/Users/admin/Downloads/tianyancha/tianyancha/needs.csv')as g:
# reader = csv.reader(g)
#
# for row in reader:
# all.append(row[3])
# print(len(all))
# print(len(set(all)))
python爬天眼查企業詳情頁數據,天眼查爬蟲
©著作權歸作者所有,轉載或內容合作請聯系作者
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。
推薦閱讀更多精彩內容
- 數據詳情 數據詳情 日課35:“我們不確定情境下的人生重大決策的秘密——當你“意識”到要做一個決定的時候,你的情緒...
- 任務: 1、獲取網址:http://sh.58.com/pbdn 中商品詳情鏈接地址2、在商品詳情頁中獲取:類別-...
- 現在到處都流行著各種各樣的建議,這些看上去很美的句子,聽上去很舒心的句子,真的可能在某一刻給我們帶來極大的釋懷,...
- 喬喬一直都不知道怎么去形容林琳。 她冷靜,她干練,她直白,她不掩飾,她不做作。但同時,難以接近。 她可以在你渾身火...
- 人生會被無關的人影響,真鬧心! 作者/閣樓上的瘋女人 請那位說過不再踏進我們家門半步的人真的不要進我們家好嗎?你去...