代碼已經不可用!token也不能用了!需要數據請聯系微信bcdata
這里的代碼并不是最新的,請到https://github.com/derekhe/bike-crawler獲取最新代碼
該爬蟲為單車地圖的Python演示代碼,具備以下功能:
- 支持ofo和摩拜
- 多線程爬取
- 自動去重
- 按照ofo和摩拜輸出對應的csv文件,存放在db/【日期】/【日期】-【時間】-【品牌】.csv文件內
運行環境:
- Python3
運行前請聯系微信bcdata獲取token,內置的token為演示用,單車位置是真實的,ID是隨機的。
運行:
pip3 install -r requirements.txt
python3 crawler.py
這里的代碼并不是最新的,請到https://github.com/derekhe/bike-crawler獲取最新代碼
import datetime
import json
import os
import os.path
import sqlite3
import threading
import time
from concurrent.futures import ThreadPoolExecutor
import numpy as np
import pandas as pd
import requests
class Crawler:
    """Crawl shared-bike positions over a lat/lng grid and export them to CSV.

    Each grid point is queried on a worker thread.  Returned bikes are staged
    in a temporary SQLite database whose primary key ``(bikeId, lat, lon)``
    drops duplicates, and are finally written to one CSV file per brand.
    """

    def __init__(self):
        """Prepare the output directory, output file prefix and counters."""
        # A single timestamp is reused so that the directory name and the
        # file name cannot disagree when the run starts right at midnight.
        self.start_time = datetime.datetime.now()
        self.csv_path = "./db/" + self.start_time.strftime("%Y%m%d")
        os.makedirs(self.csv_path, exist_ok=True)
        self.csv_name = self.csv_path + "/" + self.start_time.strftime("%Y%m%d-%H%M%S")
        self.db_name = "./temp.db"
        # Serialises database writes, counter updates and progress output.
        self.lock = threading.Lock()
        self.total = 0        # number of grid points scheduled
        self.done = 0         # number of responses decoded successfully
        self.bikes_count = 0  # number of bike records received (before de-duplication)

    def get_nearby_bikes(self, args):
        """Query the bikes around one grid point and store them.

        Args:
            args: sequence ``(lat, lng, cityid, token)``.

        Runs on a worker thread; any exception is printed rather than raised
        so that one failed grid point does not go unnoticed inside a Future.
        """
        try:
            url = "http://www.dancheditu.com:3000/bikes?lat=%s&lng=%s&cityid=%s&token=%s" % (
                args[0], args[1], args[2], args[3])
            # NOTE(review): the Host header names mwx.mobike.com although the
            # URL targets www.dancheditu.com -- confirm the service expects it.
            headers = {
                'charset': "utf-8",
                'platform': "4",
                'content-type': "application/x-www-form-urlencoded",
                'user-agent': "MicroMessenger/6.5.4.1000 NetType/WIFI Language/zh_CN",
                'host': "mwx.mobike.com",
                'connection': "Keep-Alive",
                'accept-encoding': "gzip",
                'cache-control': "no-cache"
            }
            self.request(headers, args, url)
        except Exception as ex:
            print(ex)

    def request(self, headers, args, url):
        """Fetch ``url``, store the bikes it returns and print the progress.

        Args:
            headers: HTTP headers to send.
            args: the grid-point tuple, used only in the progress message.
            url: fully built request URL.

        Raises:
            Whatever ``requests.request`` raises (network errors, timeouts).
            Decoding and storage errors are printed, not raised.
        """
        response = requests.request(
            "GET", url, headers=headers,
            timeout=30, verify=False
        )
        # The HTTP call above runs unlocked; only the bookkeeping is serialised.
        with self.lock:
            try:
                bikes = json.loads(response.text)['msg']
                self.done += 1
                self.save_bikes(bikes)

                elapsed = datetime.datetime.now() - self.start_time
                fraction = self.done / self.total
                estimated_total = elapsed / fraction
                print("位置 %s, 單車數量 %s, 進度 %0.2f%%, 速度 %0.2f個/分鐘, 總時間 %s, 剩余時間 %s" % (
                    args, self.bikes_count, fraction * 100,
                    self.done / elapsed.total_seconds() * 60,
                    estimated_total, estimated_total - elapsed))
            except Exception as ex:
                print(ex)

    def save_bikes(self, bikes):
        """Insert bike records into the staging database.

        Args:
            bikes: iterable of mappings with the keys ``brand``, ``id``,
                ``lat`` and ``lng``.  Records whose brand is ``'ofo'`` go to
                the ``ofo`` table, everything else to ``mobike``.

        ``self.bikes_count`` is incremented once per record.  Rows that are
        already present (same id and position) are ignored.  Rows inserted
        before a malformed record are still committed.

        Not synchronised by itself: ``request`` calls it while holding
        ``self.lock``.
        """
        # Epoch milliseconds with whole-second resolution, one value per batch.
        timestamp_ms = int(time.time()) * 1000
        conn = sqlite3.connect(self.db_name)
        try:
            for bike in bikes:
                self.bikes_count += 1
                # The table name comes from this fixed pair, never from the
                # response, so formatting it into the statement is safe.
                table = 'ofo' if bike['brand'] == 'ofo' else 'mobike'
                # Values are bound as parameters: ids come from the remote
                # server and must not be able to alter the SQL statement.
                # Coordinates keep the six-decimal precision previously
                # produced by '%f' formatting.
                conn.execute(
                    "INSERT OR IGNORE INTO %s VALUES (?, ?, ?, ?)" % table,
                    (timestamp_ms, str(bike['id']),
                     round(float(bike['lat']), 6), round(float(bike['lng']), 6)))
        finally:
            # The sqlite3 connection context manager does not close the
            # connection, so commit and close explicitly.
            try:
                conn.commit()
            finally:
                conn.close()

    def start(self, config):
        """Crawl the configured rectangle and export the result.

        Args:
            config: mapping with ``top_lat``, ``bottom_lat``, ``left_lng``,
                ``right_lng``, ``offset`` (grid step), ``cityid``, ``token``
                and ``workers`` (thread count).

        Raises:
            sqlite3.Error: if the staging tables cannot be created.  Nothing
                could be stored or exported without them, so the run stops
                before any request is sent.
        """
        if os.path.isfile(self.db_name):
            os.remove(self.db_name)

        conn = sqlite3.connect(self.db_name)
        try:
            with conn:
                conn.execute(self.generate_create_table_sql('ofo'))
                conn.execute(self.generate_create_table_sql('mobike'))
        finally:
            conn.close()

        # Build the whole grid first so that self.total is final before the
        # first worker computes a progress percentage from it.  Plain floats
        # keep the progress message free of numpy scalar reprs.
        lat_values = np.arange(config['top_lat'], config['bottom_lat'], -config['offset'])
        lng_values = np.arange(config['left_lng'], config['right_lng'], config['offset'])
        points = [(float(lat), float(lng)) for lat in lat_values for lng in lng_values]
        self.total = len(points)

        print("Start")
        # Leaving the with-block waits for every submitted task to finish.
        with ThreadPoolExecutor(max_workers=config['workers']) as executor:
            for lat, lng in points:
                executor.submit(self.get_nearby_bikes, (lat, lng, config['cityid'], config['token']))

        self.group_data()

    def generate_create_table_sql(self, brand):
        """Return the CREATE TABLE statement for the staging table ``brand``.

        The composite primary key ``(bikeId, lat, lon)`` is what lets
        ``INSERT OR IGNORE`` drop repeated sightings.
        """
        return '''CREATE TABLE {0}
(
    "Time" DATETIME,
    "bikeId" VARCHAR(12),
    lat DOUBLE,
    lon DOUBLE,
    CONSTRAINT "{0}_bikeId_lat_lon_pk"
    PRIMARY KEY (bikeId, lat, lon)
);'''.format(brand)

    def group_data(self):
        """Export both staging tables to their CSV files."""
        print("正在導出數據")
        conn = sqlite3.connect(self.db_name)
        try:
            self.export_to_csv(conn, "mobike")
            self.export_to_csv(conn, "ofo")
        finally:
            conn.close()

    def export_to_csv(self, conn, brand):
        """Write the table ``brand`` to ``<csv_name>-<brand>.csv``.

        Args:
            conn: open SQLite connection to the staging database.
            brand: table name; must be one of the tables created by ``start``
                because it is formatted into the query text.

        The ``Time`` column (epoch milliseconds, UTC) is converted to the
        ``Asia/Chongqing`` time zone.  The file has no header row and no
        index column.
        """
        df = pd.read_sql_query("SELECT * FROM %s" % brand, conn)
        utc_time = pd.to_datetime(df['Time'], unit='ms').dt.tz_localize('UTC')
        df['Time'] = utc_time.dt.tz_convert('Asia/Chongqing')
        df.to_csv(self.csv_name + "-" + brand + ".csv", header=False, index=False)
# Configuration
# Pick the coordinates with the Baidu coordinate picker:
# http://api.map.baidu.com/lbsapi/getpoint/
config = dict(
    # Longitude of the left edge
    left_lng=103.9213455517,
    # Latitude of the top edge
    top_lat=30.7828453209,
    # Longitude of the right edge
    right_lng=104.2178123382,
    # Latitude of the bottom edge
    bottom_lat=30.4781772402,
    # Grid step: the smallest interval used to walk the whole area. Tune it
    # yourself; www.dancheditu.com can serve as a reference if needed.
    # Too small: the crawl is too dense and returns many duplicates.
    # Too large: the crawl is too sparse and misses some bikes.
    offset=0.02,
    # City id, see the FAQ at http://www.dancheditu.com/
    cityid=75,
    # Number of threads. Keep it moderate: with too many threads the server
    # starts returning errors.
    workers=20,
    # Token. Paid tokens are obtained via WeChat "bcdata"; the demo token
    # returns real bike positions but random id numbers.
    token="demo",
)

Crawler().start(config)
print("完成")