Scraping Lianjia rental listings

from bs4 import BeautifulSoup
import requests

# Prepare the request URLs

# Desktop site
urls = ['http://bj.lianjia.com/zufang/pg{}/'.format(str(i)) for i in range(1, 101)]

# Mobile site
murls = ['http://m.lianjia.com/bj/zufang/pg{}'.format(str(i)) for i in range(1, 101)]
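Each comprehension simply substitutes the page number into the URL template, so both lists cover pages 1 through 100; a quick check of the endpoints looks like this:

print(urls[0])     # http://bj.lianjia.com/zufang/pg1/
print(urls[-1])    # http://bj.lianjia.com/zufang/pg100/
print(murls[0])    # http://m.lianjia.com/bj/zufang/pg1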

# Scrape data from the desktop pages
def pachongpc(url):
    web_date = requests.get(url)
    web_date.encoding = 'utf-8'  # force UTF-8 so the Chinese text decodes correctly
    soup = BeautifulSoup(web_date.text, 'lxml')
    # CSS selectors for each field of a listing card on the desktop page
    names = soup.select('#house-lst > li > div.info-panel > h2 > a')
    adrs = soup.select('#house-lst > li > div.info-panel > div.col-1 > div.where > a > span')
    styles = soup.select('#house-lst > li > div.info-panel > div.col-1 > div.where > span.zone > span')
    areas = soup.select('#house-lst > li > div.info-panel > div.col-1 > div.where > span.meters')
    prices = soup.select('#house-lst > li > div.info-panel > div.col-3 > div.price > span')
    times = soup.select('#house-lst > li > div.info-panel > div.col-3 > div.price-pre')
    imgs = soup.select('#house-lst > li > div.pic-panel > a > img')
    data = []
    # print(len(names), len(adrs), len(styles), len(areas), len(prices), len(times), len(imgs))
    for name, adr, style, area, price, time, img in zip(names, adrs, styles, areas, prices, times, imgs):
        info = {
            # 'name': name.get_text().replace(u'\xa0\xa0', u'').split(' ')[0],
            'adr': adr.get_text().replace(u'\xa0\xa0', u''),  # strip the non-breaking spaces in the raw text
            'style': style.get_text().replace(u'\xa0\xa0', u''),
            'area': area.get_text().replace(u'\xa0\xa0', u''),
            'price': price.get_text().replace(u'\xa0\xa0', u''),
            'time': time.get_text().replace(u'\xa0\xa0', u''),
            'img': img.get('src').replace(u'\xa0\xa0', u'')
        }
        data.append(info)
    print(data)

# Scrape data from the mobile pages
def pachongmo(url):
    web_date = requests.get(url)
    web_date.encoding = 'utf-8'
    soup = BeautifulSoup(web_date.text, 'lxml')
    # CSS selectors for each field of a listing card on the mobile page
    names = soup.select('section.page.page_zufang > div > div.mod_box.house_lists > div.mod_cont > ul > li > div > div.item_list > div.item_main.text_cut')
    adrs = soup.select('section.page.page_zufang > div > div.mod_box.house_lists > div.mod_cont > ul > li > div > div.item_list > div.item_other')
    styles = soup.select('section.page.page_zufang > div > div.mod_box.house_lists > div.mod_cont > ul > li > div > div.item_list > div.item_minor > div.info')
    prices = soup.select('div > div.mod_box.house_lists > div.mod_cont > ul > li > div > div.item_list > div.item_minor > div.price_total.q_rentprice')
    cates = soup.select('section.page.page_zufang > div > div.mod_box.house_lists > div.mod_cont > ul > li > div > div.item_list > div.tag_box')
    imgs = soup.select('section.page.page_zufang > div > div.mod_box.house_lists > div.mod_cont > ul > li > div > div.mod_media > div > img')
    data = []
    # print(len(names), len(adrs), len(styles), len(prices), len(cates), len(imgs))
    for name, adr, style, price, cate, img in zip(names, adrs, styles, prices, cates, imgs):
        info = {
            # 'name': name.get_text().replace(u'\xa0\xa0', u'').split(' ')[0],
            'adr': adr.get_text().replace(u'\xa0\xa0', u''),
            'style': style.get_text().replace(u'\xa0\xa0', u''),
            'price': price.get_text().replace(u'\xa0\xa0', u''),
            'cate': list(cate.stripped_strings),  # the tag box holds several labels, keep them as a list
            'img': img.get('src').replace(u'\xa0\xa0', u'')
        }
        data.append(info)
    print(data)

if __name__ == "__main__":
    for url in urls:
        pachongpc(url)
    for murl in murls:
        pachongmo(murl)

Note: a tag name followed by square brackets can be used to target one specific element, e.g. div > div.property_title > a[target='_blank'].
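As a small illustration of that attribute-selector syntax (the HTML snippet and the property_title class below are made up for the example, not taken from the Lianjia pages above):

from bs4 import BeautifulSoup

# Hypothetical markup, only to demonstrate the selector
html = '''
<div>
  <div class="property_title">
    <a target="_blank" href="/zufang/101.html">Listing A</a>
    <a target="_self" href="/zufang/102.html">Listing B</a>
  </div>
</div>
'''

soup = BeautifulSoup(html, 'lxml')
# [target='_blank'] keeps only the anchors whose target attribute equals _blank
links = soup.select("div > div.property_title > a[target='_blank']")
print([a.get('href') for a in links])   # ['/zufang/101.html']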
