Python 初嘗lagou職位

1、確定目標：

首先我們這次要扒拉的是lagou網的職位信息(www.lagou.com)

2、確定接口：

打開地址，檢查元素。

拉鉤截圖

3、分析參數(shù)：

查看參數后發現： pn:頁碼、kd:關鍵字、first:是否為第一次檢索；

所以我們得到了一個API地址：https://www.lagou.com/jobs/positionAjax.json?city=深圳&needAddtionalResult=false

4、編寫代碼：

開始擼碼：

import requests

def fetchURL(url, timeout=3):
    """Fetch ``url`` with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text on success, or ``None`` when the request fails.
        Failures are printed rather than raised.
    """
    headers = {
        'accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:70.0) Gecko/20100101 Firefox/70.0',
    }
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
        print(r.url)
        return r.text
    except requests.HTTPError as e:
        print(e)
        print("HTTPError")
    except requests.RequestException as e:
        print(e)
    except Exception:
        # Catch Exception rather than everything, so Ctrl-C and SystemExit
        # still terminate the script.
        print("UnKown Error!!!!!")
    return None

if __name__ == "__main__":
    # Imported here so this snippet does not depend on the file's import block.
    from urllib.parse import urlencode

    url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
    data = {
        'first':'true',
        'pn':'1',
        'kd':'python',
    }
    # This program only defines fetchURL(url), so the search parameters are
    # sent as part of the query string.
    html = fetchURL(url + '&' + urlencode(data))
    print(html)

浪一波：結果跪了！！！！提示什么呢？

{'status': False, 'msg': '您操作太頻繁,請稍后再訪問', 'clientIp': '218.17.*.*', 'state': 2402}

納尼？？？都還沒開跑就操作太頻繁，有點太過了吧！！！

在原來的基礎上改一下，是不是Referer的問題呢，我們試試看：

# Request headers: the same accept / user-agent pair as before, plus a Referer.
headers = {
    'accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:70.0) Gecko/20100101 Firefox/70.0',
    'Referer': 'https://www.lagou.com/jobs/list_ios?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=?&labelWords=hot',
}

再來一發：

結果提示同樣的問題內容。那么是不是我們的第一次就這么的草草結束呢！！

哈哈....把所有的請求頭都添加到headers里面再試一波......

還是跪了......lagou你要不要這樣子呀，不就扒個資源嘛，不讓扒呀......

沒辦法不會弄呀，找度娘吧!!

發現都是半年前的扒過的，都沒最新的....

仔細分析API接口發現cookie

仔細分析，發現問題

有網(wǎng)友說cookie的問題，每次刷新cookie都在變化，但是不是cookie的問題呢，再優(yōu)化一下

import requests

def get_data(url, data, cookies):
    """POST the search form to ``url`` and return the response body as text.

    The request headers come from the module-level ``headers`` dict, which the
    ``__main__`` section of this script defines before calling this function.

    Args:
        url: Endpoint to post to.
        data: Form fields to send.
        cookies: Cookies to attach to the request.

    Returns:
        The decoded response text, or ``None`` when the request fails.
        Failures are printed rather than raised.
    """
    try:
        # Use the cookies that were passed in; there is no shared session
        # object at module level to post through.
        r = requests.post(url, data=data, headers=headers, cookies=cookies, timeout=3)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.HTTPError as e:
        print('HttpError == >',e)
    except requests.RequestException as e:
        print("RequestExcepiton == >",e)
    except Exception:
        # Catch Exception rather than everything, so Ctrl-C and SystemExit
        # still terminate the script.
        print('UnKnown Error !!!!')
    return None

# 獲取的cookies

def get_cookies(url, headers):
    """Request ``url`` once and return the cookies the server set.

    Args:
        url: Page to visit.
        headers: Request headers to send.

    Returns:
        The cookie jar collected by the session during the request.
    """
    # The context manager closes the session's connections when done; the
    # cookie jar itself stays usable afterwards.
    with requests.Session() as session:
        session.get(url, headers=headers, timeout=3)
        return session.cookies

if __name__ == "__main__":
    # url is the JSON search endpoint; url2 is the listing page visited first
    # to collect cookies.
    url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
    url2 = 'https://www.lagou.com/jobs/list_python?city=%E6%88%90%E9%83%BD&cl=false&fromSearch=true&labelWords=&suginput='

    # Kept at module level under this exact name because get_data() reads it.
    headers = {
        'Host': 'www.lagou.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:70.0) Gecko/20100101 Firefox/70.0',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Accept-Encoding': 'gzip, deflate, br',
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'https://www.lagou.com/jobs/list_ios?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=?&labelWords=hot',
    }

    # Search form: first page of results for the keyword "python".
    data = {
        'first':'true',
        'pn':'1',
        'kd':'python',
    }

    cookie = get_cookies(url2, headers)
    html = get_data(url, data, cookie)
    print(html)

哈哈.....魔性的笑聲，自行補腦....。

5、最后處理數據，保存數據

優化一下，數據邏輯處理，然后保存文件。

以下就是終極代碼，其實還可以優化一下......

就此Python網絡爬蟲第一次就完成了.....

import requests

import time

import json

import pandas as pd

def main():
    """Script entry point: run the scrape-and-save routine."""
    get_data()

def get_data():
    """Scrape iOS job postings from lagou.com and save them via ``writePage``.

    Requests result pages 1-20 of the position search endpoint, turns every
    posting into one row, and passes all rows (header row first) to
    ``writePage``. If a response carries no ``content`` section, its message
    is printed, paging stops, and the rows collected so far are still saved.
    """
    url_getCookie = "https://www.lagou.com/jobs/list_iOS?city=%E6%88%90%E9%83%BD&cl=false&fromSearch=true&labelWords=&suginput="
    url_parse = "https://www.lagou.com/jobs/positionAjax.json?city=深圳&needAddtionalResult=false"
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': 'https://www.lagou.com/jobs/list_iOS?labelWords=&fromSearch=true&suginput=',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }

    # Header row; column order matches _job_to_row().
    comments = [[
        '職位',
        '公司名稱',
        '薪水范圍',
        '規模',
        '行業',
        '學歷要求',
        '工作年限',
        '優勢',
        '創建時間',
        '地址',
        '經度',
        '緯度',
    ]]

    # When this was written, text['content']['positionResult']['totalCount']
    # reported 280 postings and text['content']['pageSize'] was 15 per page.
    for x in range(1, 21):
        data = {
            'first': 'true',
            'pn': str(x),
            'kd': 'iOS'
        }
        # A new session per page: load the listing page to obtain cookies,
        # then post the search with those cookies.
        with requests.Session() as s:
            s.get(url_getCookie, headers=headers, timeout=3)
            cookie = s.cookies
            response = s.post(url_parse, data=data, headers=headers, cookies=cookie, timeout=3)

        # Pause five seconds between pages.
        time.sleep(5)

        response.encoding = response.apparent_encoding
        text = json.loads(response.text)

        content = text.get('content')
        if not content:
            # Rejections such as {'status': False, 'msg': ...} have no
            # 'content'; stop paging and keep what was gathered.
            print(text.get('msg', text))
            break

        for i in content['positionResult']['result']:
            print(i['positionName'], i['companyFullName'], i['createTime'])
            comments.append(_job_to_row(i))
        print('-----'*15)

    writePage(comments)


def _build_address(job):
    """Concatenate city, district and station name, skipping any that are None."""
    parts = (job['city'], job['district'], job['stationname'])
    return ''.join(part for part in parts if part is not None)


def _job_to_row(job):
    """Return one posting's fields as a list in header-row order."""
    row = [
        job['positionName'],
        job['companyFullName'],
        job['salary'],
        job['companySize'],
        job['industryField'],
        job['education'],
        job['workYear'],
        job['positionAdvantage'],
        job['createTime'],
    ]
    row.append(_build_address(job))
    row.append(job['longitude'])
    row.append(job['latitude'])
    return row

def writePage(connects):
    """Append rows to ``lagou_comment.csv``.

    Args:
        connects: Sequence of rows, each row a sequence of cell values.

    The file is opened in append mode and written with the ``utf_8_sig``
    codec, comma-separated, without pandas' index column or header line.
    """
    dataframe = pd.DataFrame(connects)
    dataframe.to_csv(
        'lagou_comment.csv',
        encoding='utf_8_sig',
        mode='a',
        index=False,
        sep=',',
        header=False,
    )

# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()

三个男躁一个女,国精产品一区一手机的秘密,麦子交换系列最经典十句话,欧美国产综合欧美视频

Python 初嘗lagou職位

Python 初嘗lagou職位

1、確定目標：

2、確定接口：

3、分析參數(shù)：

4、編寫代碼：

仔細分析API接口發現cookie

5、最后處理數據，保存數據

推薦閱讀更多精彩內容

三个男躁一个女,国精产品一区一手机的秘密,麦子交换系列最经典十句话,欧美 国产 综合 欧美 视频

Python 初嘗lagou職位

1、確定目標：

2、確定接口：

3、分析參數(shù)：

4、編寫代碼：

仔細分析API接口發現cookie

5、最后處理數據，保存數據

推薦閱讀更多精彩內容

三个男躁一个女,国精产品一区一手机的秘密,麦子交换系列最经典十句话,欧美国产综合欧美视频

1、確定目標：

3、分析參數(shù)：

4、編寫代碼：

5、最后處理數據，保存數據