爬蟲方法封裝及存儲(以糗百為例)

# 1.爬取作者信息(頭像/昵稱/性別/年齡)
# 2.帖子內容,好笑數,評論數
# 3.存入 csv

# 翻頁處理方式
# 1.分開處理   一頁一頁處理 (例:爬取當前頁後直接存儲)
# 2.翻頁到結束處理  從首頁翻頁到尾頁 (例:翻頁到尾頁後統一處理)
import csv
import pymysql
import requests
from bs4 import BeautifulSoup

#爬取一頁的內容
def qb_spider1(page=1):
    """Scrape a single listing page of the qiushibaike text section.

    Args:
        page: Page number substituted into the listing URL (defaults to 1).

    Returns:
        A ``(datas, has_next)`` tuple. ``datas`` is a list holding one dict
        per post with the keys 頭像 (avatar URL), 用戶名 (author name),
        性別 (gender), 年齡 (age), 段子 (post text), 好笑數 (vote count) and
        評論 (comment count). ``has_next`` is True when the page contains a
        ``<span class="next">`` element, i.e. another page can be requested.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched
            (including a timeout).
    """
    url_temp = "https://www.qiushibaike.com/text/page/{}/".format(page)
    # Without a timeout, requests waits indefinitely on a stalled connection.
    rep = requests.get(url=url_temp, timeout=10)
    soup = BeautifulSoup(rep.text, "lxml")

    datas = []
    for article in soup.find_all("div", class_='article'):
        author_div = article.find("div", class_="author")
        # The first <img> inside the post is used as the author's avatar; its
        # alt text is used as the author name.
        author_img = article.img
        # The src is prefixed with a scheme (presumably it is protocol-relative
        # on the site -- confirm against the live markup).
        author_icon = "https:" + author_img.attrs['src']
        author_name = author_img.attrs['alt']

        if "匿名用戶" not in author_name:
            # Named users: the first <div> under the author block carries the
            # age as text and the gender in its class attribute.
            article_gender = author_div.div
            author_age = article_gender.text.strip()
            article_gender_class = article_gender.attrs['class']
            # NOTE(review): BeautifulSoup returns `class` as a list of tokens,
            # so this is an exact-token match, not a substring match. Confirm
            # the site really uses a bare "man" class token.
            author_gender = "man" if "man" in article_gender_class else "woman"
        else:
            # Anonymous posts get placeholder values.
            author_age = -1
            author_gender = "no"

        article_content = article.find("div", class_='content').span.text.strip()
        # The first two <i class="number"> elements are read as the vote count
        # and the comment count, in that order.
        article_stats = article.find_all("i", class_='number')
        stats_vote = article_stats[0].text.strip()
        stats_comment = article_stats[1].text.strip()

        datas.append({
            '頭像': author_icon,
            '用戶名': author_name,
            '性別': author_gender,
            '年齡': author_age,
            '段子': article_content,
            '好笑數': stats_vote,
            '評論': stats_comment,
        })

    has_next = soup.find('span', class_='next') is not None
    return datas, has_next


# 2.翻頁到結束處理  從首頁翻頁到尾頁 (例:翻頁到尾頁後統一處理)
def qb_spider2(page=1):
    """Scrape the qiushibaike text section from ``page`` through the last page.

    Pages are fetched one after another in a loop (rather than by recursion)
    so that stack depth and the number of live parse trees do not grow with
    the number of pages.

    Args:
        page: Page number to start from (defaults to 1).

    Returns:
        A list with one row per post, across all visited pages. Each row is
        ``[author_icon, author_name, author_gender, author_age,
        article_content, stats_vote, stats_comment]``.

    Raises:
        requests.exceptions.RequestException: if a page cannot be fetched
            (including a timeout).
    """
    article_infos = []
    while True:
        url_temp = "https://www.qiushibaike.com/text/page/{}/".format(page)
        # Without a timeout, requests waits indefinitely on a stalled connection.
        rep = requests.get(url=url_temp, timeout=10)
        soup = BeautifulSoup(rep.text, "lxml")

        for article in soup.find_all("div", class_='article'):
            author_div = article.find("div", class_="author")
            # The first <img> inside the post is used as the author's avatar;
            # its alt text is used as the author name.
            author_img = article.img
            author_icon = "https:" + author_img.attrs['src']
            author_name = author_img.attrs['alt']

            if "匿名用戶" not in author_name:
                # Named users: the first <div> under the author block carries
                # the age as text and the gender in its class attribute.
                article_gender = author_div.div
                author_age = article_gender.text.strip()
                article_gender_class = article_gender.attrs['class']
                # NOTE(review): `class` is a list of tokens, so this is an
                # exact-token match -- confirm the site uses a bare "man".
                author_gender = "man" if "man" in article_gender_class else "woman"
            else:
                # Anonymous posts get placeholder values.
                author_age = -1
                author_gender = "no"

            article_content = article.find("div", class_='content').span.text.strip()
            # The first two <i class="number"> elements are read as the vote
            # count and the comment count, in that order.
            article_stats = article.find_all("i", class_='number')
            stats_vote = article_stats[0].text.strip()
            stats_comment = article_stats[1].text.strip()

            article_infos.append([
                author_icon, author_name, author_gender, author_age,
                article_content, stats_vote, stats_comment,
            ])

        # A <span class="next"> element signals that another page exists.
        if soup.find('span', class_='next') is None:
            return article_infos
        page += 1

#雙列表寫入csv
def list_write_to_csv(data_, filename):
    """Write rows to ``<filename>.csv``, overwriting any existing file.

    Args:
        data_: Iterable of rows, each row an iterable of cell values.
        filename: Output path without the ``.csv`` extension.

    The file is encoded as UTF-8 with a byte-order mark (``utf-8-sig``).
    """
    target = filename + '.csv'
    with open(target, mode="w", encoding='utf-8-sig', newline='') as out:
        csv_out = csv.writer(out)
        for row in data_:
            csv_out.writerow(row)

#列表存字典寫入csv
def dict_write_to_csv(data_, filename):
    """Write a list of dicts to ``<filename>.csv``, overwriting any existing file.

    The keys of the first dict become the header row; every dict then
    contributes one row made of its values, in that dict's own order.

    Args:
        data_: List of dicts, one per record. An empty list produces an
            empty file instead of raising IndexError.
        filename: Output path without the ``.csv`` extension.
    """
    with open(filename + '.csv', 'w', encoding='utf-8', newline='') as f:
        if not data_:
            # Nothing to write: there is no first record to take a header from.
            return
        writer = csv.writer(f)
        writer.writerow(data_[0].keys())
        writer.writerows(record.values() for record in data_)

#存入數據庫(提前建好表)
def write_to_mysql(item):
    """Insert one scraped post into the ``qiubai`` MySQL table.

    The table must already exist. A new connection is opened for each call
    and is always closed before returning, whether or not the insert
    succeeds.

    Args:
        item: Dict with the keys 頭像, 用戶名, 性別, 年齡, 段子, 好笑數 and 評論.

    Raises:
        KeyError: if ``item`` lacks one of the expected keys (raised before
            any connection is opened).
        pymysql.MySQLError: if connecting or inserting fails.
    """
    insert_sql = (
        "insert into qiubai (author_icon,author_name,author_gender,author_age,"
        "article_content,stats_vote,stats_comment) values(%s,%s,%s,%s,%s,%s,%s)"
    )
    params = (
        item['頭像'],
        item['用戶名'],
        item['性別'],
        item['年齡'],
        item['段子'],
        item['好笑數'],
        item['評論'],
    )

    # NOTE(review): host and credentials are hard-coded; consider moving them
    # to configuration.
    conn = pymysql.connect(host='192.168.169.157', user='root', passwd='123456',
                           db='haha', port=3306, charset='utf8')
    # `with conn:` means different things across PyMySQL versions, so the
    # cursor scope, commit and close are spelled out explicitly instead.
    try:
        with conn.cursor() as cur:
            # Values are passed as parameters, never formatted into the SQL.
            cur.execute(insert_sql, params)
        conn.commit()
    finally:
        conn.close()
       



def main1():
    """Scrape page by page, then print every post and store it in MySQL.

    Pages are requested through ``qb_spider1`` starting at page 1 until it
    reports that there is no next page. Each collected post is then printed
    and passed to ``write_to_mysql``.
    """
    page = 1
    has_next = True
    all_articles = []
    while has_next:
        articles, has_next = qb_spider1(page)
        page += 1
        # extend (not append): merge this page's posts into one flat list
        # rather than nesting each page's list as a single element.
        all_articles.extend(articles)
        print(has_next, page)

    # Alternative sink: dict_write_to_csv(all_articles, 'qbpachong')
    for item in all_articles:
        print(item)
        write_to_mysql(item)


def main2():
    """Scrape every page in one go, save the rows to CSV, then print them.

    All rows come from a single ``qb_spider2`` call, are written to
    ``qiushibaike_text.csv`` via ``list_write_to_csv``, and are then printed
    one per line.
    """
    rows = qb_spider2()
    list_write_to_csv(rows, "qiushibaike_text")
    for row in rows:
        print(row)


# Script entry point: run the page-by-page scraper when executed directly.
if __name__ == "__main__":
    main1()

自己常用的兩種儲存方式

字典寫入csv.png
雙列表形式寫入csv.png
數據存儲至MySQL.png
最後編輯於
©著作權歸作者所有,轉載或內容合作請聯繫作者
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。

推薦閱讀更多精彩內容

  • 你既沒手也無腳 時不時還遭人鄙棄 可為了你庭院的美好 你,你從不考慮我們人的煩惱 總是風馳電掣地旋轉 我來不及站穩...
    平心如我閱讀 224評論 0 0
  • "別人的眼光真的那么重要嗎?"這個問題我在心里問過自己無數次,每次都有新的思考。在乎別人的眼光的人沒有錯,但是...
    一曄閱讀 899評論 0 1
  • 早上我6點過就醒了,跟朋友約起去幫我找青苔。到兒子臥室看,兒子已經醒了,我就去煮餃子,煮好了,我叫兒子起來...
    紫玉_b836閱讀 268評論 2 2