Python學習日記3| 用python多進程爬取58同城北京地區10w+數據

今天是4.13號。

昨天把會議論文算是完成任務地寫完然后提交了，而實習還沒有找上，所以最近一段時間應該都會整天在實驗室學習python吧，加上最近一個多星期全部都是大雨哪也去不了（說的好像不下雨就會出去轉悠一樣）。本來還想問一下宋教授現在有什么項目可以跟過去做，但又怕把python的學習落下，所以還是最近半個月先把這個課程全部學完吧。另外電腦運行pycharm真心帶不動，所以也在等家里的那臺筆記本寄過來，同時不得不提的是也在等投稿的論文消息，hope there will be a good result。

照樣在貼上代碼之前，總結在實際中新學的知識與所遇到的問題。
(1).快捷鍵ctrl+/可以多行注釋，全部選定后tab可以多行縮進，shift+tab則可以向左縮進。
(2).注意select('')和split('')得到的結果都是列表，所以都要在后面加下標[number]。
(3).X.stripped_strings 是一個生成器，用于取出標簽X內的所有文本并去除其中多余的空格和空行。同時注意要用list()把它轉換成列表才能直接查看或取下標。
(4).對于多種分類情況時，最好用if語句來進行判斷。判斷某特定字符串s1是否包含在另一字符串s2中，可用if s1 in s2（s1、s2是字符串變量；只有直接寫字符串字面量時才需要加引號，例如 if '404' in s2）。

(5).要關注抓取的數據是網頁自帶的，還是通過request返回的json數據，一般json都是字典數據。對于瀏覽量等JS數據，首先在審查元素的network-JS中找到相關網頁，然后進行解析。
解析過程包括：將查詢網頁的id導出，然后用format()直接替換到相應的JS動態網頁構造成新的網頁；接著跟一般網頁解析一樣用requests.get()去請求；最后由于JS網頁的回應內容都是字符串，所以直接用js.text然后再用相應的split或其他方法截取自己想要的內容。
還有一個問題要注意：請求JS數據時，記得加上headers，其中包括 'Referer' 和 'User-Agent'。

第一段

__author__ = 'guohuaiqi'
#!/usr/bin/env python
# _*_ coding: utf-8 _*_
from bs4 import BeautifulSoup
import requests
import string

# Page whose side menu is scraped for category links (the argument used in
# the commented-out get_cate_link(url) call below).
url='http://bj.58.com/sale.shtml'
# Site root; get_cate_link() prepends it to each href taken from the menu.
host='http://bj.58.com'

# Collect the links of all goods categories from the page's side menu.
def get_cate_link(url):
    """Fetch ``url`` and return the absolute link of every category in its side menu.

    The previous version built each link and then discarded it (the ``print``
    was commented out), so the call had no observable result. The links are
    now returned; callers that ignored the old ``None`` return are unaffected.

    Args:
        url: Address of the page whose ``#ymenu-side`` menu is scraped.

    Returns:
        list of str: the module-level ``host`` joined with each anchor's
        ``href``, in document order.
    """
    web_data=requests.get(url)
    soup=BeautifulSoup(web_data.text,'lxml')
    anchors=soup.select('#ymenu-side > ul > li > ul > li > b > a')
    # An anchor without an href would make ``host + None`` raise TypeError,
    # so such anchors are skipped.
    return [host+anchor.get('href') for anchor in anchors if anchor.get('href')]

# get_cate_link(url)

# Whitespace-separated category URLs, one per line. The launcher script turns
# this string into a list with cate_list.split() before handing it to the pool.
cate_list="""
    http://bj.58.com/shouji/
    http://bj.58.com/tongxunyw/
    http://bj.58.com/danche/
    http://bj.58.com/fzixingche/
    http://bj.58.com/diandongche/
    http://bj.58.com/sanlunche/
    http://bj.58.com/peijianzhuangbei/
    http://bj.58.com/diannao/
    http://bj.58.com/bijiben/
    http://bj.58.com/pbdn/
    http://bj.58.com/diannaopeijian/
    http://bj.58.com/zhoubianshebei/
    http://bj.58.com/shuma/
    http://bj.58.com/shumaxiangji/
    http://bj.58.com/mpsanmpsi/
    http://bj.58.com/youxiji/
    http://bj.58.com/jiadian/
    http://bj.58.com/dianshiji/
    http://bj.58.com/ershoukongtiao/
    http://bj.58.com/xiyiji/
    http://bj.58.com/bingxiang/
    http://bj.58.com/binggui/
    http://bj.58.com/chuang/
    http://bj.58.com/ershoujiaju/
    http://bj.58.com/bangongshebei/
    http://bj.58.com/diannaohaocai/
    http://bj.58.com/bangongjiaju/
    http://bj.58.com/ershoushebei/
    http://bj.58.com/yingyou/
    http://bj.58.com/yingeryongpin/
    http://bj.58.com/muyingweiyang/
    http://bj.58.com/muyingtongchuang/
    http://bj.58.com/yunfuyongpin/
    http://bj.58.com/fushi/
    http://bj.58.com/nanzhuang/
    http://bj.58.com/fsxiemao/
    http://bj.58.com/xiangbao/
    http://bj.58.com/meirong/
    http://bj.58.com/yishu/
    http://bj.58.com/shufahuihua/
    http://bj.58.com/zhubaoshipin/
    http://bj.58.com/yuqi/
    http://bj.58.com/tushu/
    http://bj.58.com/tushubook/
    http://bj.58.com/wenti/
    http://bj.58.com/yundongfushi/
    http://bj.58.com/jianshenqixie/
    http://bj.58.com/huju/
    http://bj.58.com/qiulei/
    http://bj.58.com/yueqi/
    http://bj.58.com/tiaozao/
"""

第二段

__author__ = 'guohuaiqi'
# !/usr/bin/env python
# _*_ coding: utf-8 _*_
from bs4 import BeautifulSoup
import requests
import time
import pymongo
import sys

# Client for the MongoDB server at localhost:27017.
client=pymongo.MongoClient('localhost',27017)
# Database that holds both collections used by this crawler.
tongcheng=client['tongcheng']
# Collection of item links; get_content_links() inserts {'url': ...} documents here.
urllist=tongcheng['urllist']
# Collection of parsed item details; get_item_content() inserts its result here.
content=tongcheng['content']


# Crawl one listing page of a category and store every item link found on it;
# the cate_url values come from cate_list.
def get_content_links(cate_url,page):
    """Scrape page ``page`` of a category and process every bj.58.com item link on it.

    Each accepted link is inserted into the ``urllist`` collection and then
    passed to ``get_item_content`` for detail parsing.

    Args:
        cate_url: Category root URL, expected to end in ``/``
            (e.g. ``http://bj.58.com/danche/``).
        page: Page number; it is appended to ``cate_url`` as ``pn<page>/``.

    Returns:
        None.
    """
    # The bare category URL only serves its first page, so build the paged
    # address explicitly, e.g. http://bj.58.com/danche/pn2/
    page_list='{}pn{}/'.format(cate_url,page)
    web_data=requests.get(page_list)
    soup=BeautifulSoup(web_data.text,'lxml')
    time.sleep(1)
    # No <td class="t"> cell means the page carries no listings
    # (presumably a page number past the last one), so there is nothing to store.
    if not soup.find('td','t'):
        return
    for anchor in soup.select('td.t a.t'):
        href=anchor.get('href')
        # An anchor without an href used to raise AttributeError on None.split().
        if not href:
            continue
        # Drop the query string so the same item always yields the same URL.
        content_link=href.split('?')[0]
        # Only links hosted on bj.58.com are kept; everything else is ignored.
        if 'bj.58.com' not in content_link:
            continue
        urllist.insert_one({'url':content_link})
        get_item_content(content_link)

# cate_url='http://bj.58.com/youxiji/'
# get_content_links(cate_url,20)

# Parse the detail page of one item: category, title, date, price and district.
def _first_text(soup,css):
    """Return the text of the first element matching ``css``, or None if nothing matches."""
    matches=soup.select(css)
    return matches[0].text if matches else None


def _extract_district(soup):
    """Return the district as a list of strings, or None when no summary row is labelled '區域'.

    The label is looked for in the 2nd and then the 3rd ``<li>`` of the
    summary list, in that order.
    """
    row_css='#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li:nth-of-type({})'
    for position in (2,3):
        row=row_css.format(position)
        label=soup.select(row+' > div.su_tit')
        if not label or '區域' not in label[0].get_text():
            continue
        # Prefer the <span> inside the value cell. The original passed this
        # CSS selector to find_all(), which matches tag names rather than
        # selectors, so this branch could never be taken.
        spans=soup.select(row+' > div.su_con > span')
        if spans:
            return list(spans[0].stripped_strings)
        cells=soup.select(row+' > div.su_con')
        return list(cells[0].stripped_strings) if cells else None
    return None


def get_item_content(content_link):
    """Download one item page and insert its parsed fields into the ``content`` collection.

    Nothing is stored when the request fails with a connection error or when
    the page is recognised as a 404 page. Fields whose element is missing from
    the page are stored as ``None``, matching the existing convention for
    ``price`` and ``district``.

    Args:
        content_link: URL of the item page. Links from other sources are
            already filtered out by ``get_content_links``.

    Returns:
        None.
    """
    # Only the HTTP request can raise requests.ConnectionError, so the try
    # block is limited to it.
    try:
        web_data1=requests.get(content_link)
    except requests.ConnectionError as e:
        # e.response is normally None for a connection failure; print the error itself.
        print(e)
        return
    soup=BeautifulSoup(web_data1.text,'lxml')

    # A page counts as "not found" when the src of its first
    # <script type="text/javascript"> has a path segment equal to '404'.
    # A missing script tag or src attribute is treated as "page exists".
    first_script=soup.find('script',type='text/javascript')
    script_src=first_script.get('src') if first_script else None
    if script_src and '404' in script_src.split('/'):
        return

    goods_cate=_first_text(soup,'#header > div.breadCrumb.f12 > span:nth-of-type(3) > a')
    title=_first_text(soup,'#content h1')
    date=_first_text(soup,'#content li.time')
    price_text=_first_text(soup,'span.price.c_f50')
    # A price marked '面議' is stored as None instead of text.
    if price_text is None or '面議' in price_text:
        price=None
    else:
        price=price_text.replace('元','').strip()

    data={
        'goods_cate':goods_cate.strip() if goods_cate is not None else None,
        'title':title.strip() if title is not None else None,
        'date':date.replace('.','-') if date is not None else None,
        'price':price,
        'district':_extract_district(soup)
        }
    content.insert_one(data)
#
# b=['http://bj.58.com/shuma/23190415633187x.shtml','http://bj.58.com/yishu/25471342844357x.shtml','http://bj.58.com/shouji/25683386143296x.shtml','http://bj.58.com/shuma/23425779899550x.shtml']
# get_item_content(b)
# get_content_links('http://bj.58.com/shouji/',20)

第三段

# _*_ coding: utf-8 _*_
#!/usr/bin/env python
__author__ = 'guohuaiqi'
from multiprocessing import Pool
from get_cate_link import cate_list
from get_all_contents import get_content_links,urllist,content

# Resume support: after an interrupted run, pass rest_list to pool.map()
# instead of the category links so that already-parsed URLs are not repeated.
db_urllist=[item['url'] for item in urllist.find()]
# Fixed typo: the collection method is find(), not fina(), which failed as
# soon as this module was loaded. Documents without a 'url' field are skipped
# instead of raising KeyError.
# NOTE(review): the documents get_item_content() stores, as shown in this
# file, carry no 'url' key, so y stays empty until one is added. Confirm the
# intended schema.
content_urllist=[item['url'] for item in content.find() if 'url' in item]
x=set(db_urllist)
y=set(content_urllist)
# URLs that were collected but whose details have not been stored yet.
rest_list=x-y

def get_all_links(cate_url):
    """Crawl listing pages 1 through 100 of the category at ``cate_url``, in order."""
    first_page,last_page=1,100
    page_no=first_page
    while page_no<=last_page:
        get_content_links(cate_url,page_no)
        page_no+=1

if __name__=='__main__':
    # One worker process per CPU core (the Pool default); each worker crawls
    # whole categories. map() blocks until every category is done, and the
    # context manager then shuts the workers down. The original never closed
    # the pool.
    with Pool() as pool:
        pool.map(get_all_links,cate_list.split())

第四段
最后再加上一個count函數來對數據庫中的item計數

__author__ = 'guohuaiqi'
# !/usr/bin/env python
# _*_ coding: utf-8 _*_
import time
from get_all_contents1 import content

# Print the number of stored items every 3 seconds until the script is interrupted.
while True:
    # Cursor.count() was deprecated in PyMongo 3.7 and removed in 4.0;
    # count_documents({}) counts every document in the collection.
    print(content.count_documents({}))
    time.sleep(3)

再要注意的就是，一定一定在寫代碼前在最前面加上：
#!/usr/bin/env python
# _*_ coding: utf-8 _*_

在爬取了10745條數據后自己手動停止了程序，一共花了差不多12分鐘。

三个男躁一个女,国精产品一区一手机的秘密,麦子交换系列最经典十句话,欧美国产综合欧美视频

Python學習日記3| 用python多進程爬取58同城北京地區10w+數據

Python學習日記3| 用python多進程爬取58同城北京地區10w+數據

推薦閱讀更多精彩內容

三个男躁一个女,国精产品一区一手机的秘密,麦子交换系列最经典十句话,欧美 国产 综合 欧美 视频

Python學習日記3| 用python多進程爬取58同城北京地區10w+數據

推薦閱讀更多精彩內容

三个男躁一个女,国精产品一区一手机的秘密,麦子交换系列最经典十句话,欧美国产综合欧美视频