第二周、第三節,短線重連程序及抓取分類代碼url_list

#count程序

"""Monitor script: print the number of collected URLs every 5 seconds.

Run alongside the crawler to watch the url_list collection grow.
"""
import time

from get_parting import url_list

while True:
    # count_documents({}) replaces the deprecated Cursor.count()
    # (removed in PyMongo 4); behavior for a full count is identical.
    print(url_list.count_documents({}))
    time.sleep(5)

#main程序及多進程
"""Main driver: crawl listing pages of every channel in parallel.

Also computes the set of queued URLs whose detail pages have not been
scraped yet (breakpoint-resume support).
"""
from multiprocessing import Pool

from channl_cates import channl_list
from page_parting import get_countent, url_list, item_info, get_links_from

# URLs already queued vs. URLs whose detail pages are already scraped;
# the set difference is what still needs detail scraping.
db_urls = [item['url'] for item in url_list.find()]
index_urls = [item['url'] for item in item_info.find()]
rest_of_urls = set(db_urls) - set(index_urls)


def get_all_links_from(channel):
    """Scrape listing pages 2..100 of one channel (who_sell fixed at 1)."""
    # Fixed the original paste, which had this `def` line duplicated
    # (the first copy had an empty body — a SyntaxError).
    for num in range(2, 101):
        get_links_from(channel, 1, num)


# Original read `if name=='main':` — the markdown rendering stripped the
# dunder underscores; restored so the pool actually starts when run.
if __name__ == '__main__':
    # One worker process per CPU core; each handles a subset of channels.
    pool = Pool()
    pool.map(get_all_links_from, channl_list.split())

#獲取url_list程序

"""Collector: scrape item URLs from 58.com listing pages into MongoDB."""
from bs4 import BeautifulSoup
import requests
import time
import pymongo

client = pymongo.MongoClient('localhost', 27017)
jort58 = client['jort58_']
# url_list: queue of item URLs found on listing pages.
url_list = jort58['url_list']
# item_info: detail data scraped from each queued URL.
item_info = jort58['item_info']


def get_links_from(channl, who_sell, pages):
    """Fetch one listing page and queue every new item URL found on it.

    Args:
        channl: channel base URL; a trailing slash is assumed so that
            '{channl}{who_sell}/pn{pages}' forms a valid URL — TODO confirm.
        who_sell: seller-type path segment of the 58.com URL (e.g. 0/1).
        pages: 1-based listing page number.
    """
    # str.format stringifies its arguments, so the original str() wrappers
    # were redundant.
    links_views = '{}{}/pn{}'.format(channl, who_sell, pages)
    wb_data = requests.get(links_views)
    time.sleep(1)  # throttle requests to avoid getting banned
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # A <td class="t"> is only present on a valid (non-empty) listing page;
    # anything else (last page reached, error page) is silently skipped.
    if soup.find('td', 't'):
        for link in soup.select('td.t a.t'):
            item_link = link.get('href').split('?')[0]  # strip tracking query
            # De-duplicate: insert only URLs not already queued.
            if not url_list.find_one({'url': item_link}):
                url_list.insert_one({'url': item_link})
                print(item_link)

最后編輯于
?著作權歸作者所有,轉載或內容合作請聯系作者
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。

推薦閱讀更多精彩內容