抓取易车车型配置小记
因为易车网车型配置信息为动态加载,导致之前用于抓取参数的python脚本失效,根据原脚本结合scrapy和splash来进行参数采集
splash:用于渲染需要js动态加载的页面。先启动splash,官方提供有现成镜像,可以直接拿来使用:
docker run -d --name=splash --restart=always -p 8050:8050 scrapinghub/splash:latest
然后新建一个scrapy项目:
scrapy startproject yiche
cd yiche
修改scrapy的配置:
vi yiche/settings.py
BOT_NAME = 'yiche'
# Where Scrapy looks for spiders, and where `scrapy genspider` creates new ones.
SPIDER_MODULES = ['yiche.spiders']
NEWSPIDER_MODULE = 'yiche.spiders'

# Whether to obey robots.txt.
ROBOTSTXT_OBEY = False

# Crawl speed: slow down a little so the server does not ban us.
DOWNLOAD_DELAY = 1

##### splash settings: begin #####
# Splash address; the container runs on this machine, hence localhost.
SPLASH_URL = 'http://localhost:8050/'

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

# Splash-aware duplicate filter.
DUPEFILTER_CLASS = "scrapy_splash.SplashAwareDupeFilter"

# HTTP cache: enabled, stored under 'httpcache' with the Splash-aware storage
# backend; expiration is set to 0 seconds.
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
##### splash 配置 结束#####
大体思路
1、主爬虫脚本内容,抓取页面用的是车型对应页面渲染的配置列表,例如 http://car.yiche.com/aodia3-3999/m126313/peizhi/
2、通过mongodb中的bitauto_series和bitauto_vehicle表分别取出车型拼音和车型id用于url拼接
3、将html页面交给lxml解析
4、按分类替换不同分类的标签为表格标签,每次取一个分类,按行获取相应的参数值
5、使用replace替换页面中的空格、换行等无用信息,方便过滤内容
6、通过大标题,例如“基本信息,车身尺寸,娱乐配置”等分类将所有配置分为参数和配置两大类,并根据id更新到mongodb中
vi get_yiche.py
# -*- coding: utf-8 -*-
import scrapy
import requests
import re
from bs4 import BeautifulSoup as bs
import datetime
import time
import sys
import os
import pymongo
import xlsxwriter
import json
from scrapy_splash import SplashRequest
from scrapy.http import Request,FormRequest
from scrapy.selector import Selector
# Shared MongoDB handles used by the spider below.
client = pymongo.MongoClient(host="172.23.210.21", port=27017)

# Series collection: the spider reads `seriesid` / `seriespinyin` from it.
db = client["MonthlyTasks"]["bitauto_series"]

# Vehicle collection: the spider reads vehicles from it and writes the
# scraped parameters/configuration back to it.
db1 = client["MonthlyTasks"]["bitauto_vehicle"]

# Today's local date formatted as YYYY-MM-DD.
now = datetime.date.today().isoformat()

# Base URL of the bitauto brand site.
starturl_brand = 'https://www.bitauto.com'
class GetHtml(scrapy.Spider):
    """Scrape per-vehicle configuration pages from car.yiche.com via Splash.

    For every activated vehicle in ``db1`` a configuration-page URL is built
    from the series pinyin (looked up in ``db``) and the vehicle id.  The page
    is rendered by Splash, its markup is rewritten into one ``<table>`` per
    category, and the extracted rows are stored back on the vehicle document
    under the ``参数`` (parameters) and ``配置`` (configuration) fields.
    """

    name = 'yiche'
    # Every request issued below targets car.yiche.com, so that domain has to
    # be allowed; otherwise Scrapy's offsite filtering may drop the requests.
    allowed_domains = ["bitauto.com", "yiche.com"]
    start_urls = ["http://car.yiche.com/aodia3-3999/m126313/peizhi/"]

    # Ordered (old, new, suffix) rewrite steps applied to the rendered page.
    # Each step runs ``html.replace(old, new) + suffix``; the order and the
    # appended suffixes are significant and must not be rearranged.
    _HTML_REWRITES = (
        ('<tr class="data-tr', '<tr', ''),
        # Start a new <table> at every category header row.
        ('<tr class="param-carInfo',
         '</table><table><tr class="param-carInfo', '</table>'),
        ('<span>', '', ''),
        ('<span class="optional-item-price">', '', ''),
        ('<tr class="no-clone">', '', ''),
        ('<div class="div-in-td-content">', '', ''),
        ('<td><div class="div-in-td-content "', '<td', ''),
        # Turn the name cell of each row into a <th>.
        ('<td rowspan="1"', '<th', '</th>'),
        ('<td colspan="6">', '', ''),
        ('\n', '', ''),
        ('\r', '', ''),
        # NOTE(review): this strips every literal space, including the ones
        # inside tags and attributes.  It may have been '&nbsp;' before the
        # post was rendered -- confirm against the original script.
        (' ', '', ''),
    )

    def start_requests(self):
        """Yield one Splash request per activated vehicle.

        The vehicle document ``_id`` travels in ``meta['id']`` so that
        ``parse`` can update the matching document.
        """
        # Series id -> series pinyin, used to build the configuration URL.
        # ``setdefault`` keeps the first pinyin seen for a given series id.
        seriesinfo = {}
        for series in db.find({"seriesid": {"$ne": None}}):
            seriesinfo.setdefault(series['seriesid'], series['seriespinyin'])

        # Vehicle ids, used to build the configuration URL.
        vehicles = db1.find({"webstatus": "activated"}).batch_size(100)
        for count, vehicle in enumerate(vehicles, 1):
            doc_id = vehicle['_id']
            print(count, doc_id)
            seriesid = vehicle['seriesid']
            seriespinyin = seriesinfo.get(seriesid)
            if seriespinyin is None:
                # A vehicle whose series is unknown must not abort the crawl.
                self.logger.warning(
                    "no series pinyin for seriesid %r (vehicle %r); skipped",
                    seriesid, doc_id)
                continue
            url = "http://car.yiche.com/%s/m%s/peizhi/" % (
                seriespinyin, vehicle['vehicleid'])
            yield SplashRequest(
                url,
                endpoint='render.html',
                args={'wait': 1},
                callback=self.parse,
                meta={'id': doc_id},
            )

    @staticmethod
    def _normalize_html(html):
        """Apply the ordered ``_HTML_REWRITES`` steps to *html*."""
        for old, new, suffix in GetHtml._HTML_REWRITES:
            html = html.replace(old, new) + suffix
        return html

    def parse(self, response):
        """Extract the category tables and store them on the vehicle document.

        Categories whose heading contains ``配置`` or equals ``信息娱乐`` are
        stored as configuration groups; every other category is stored as a
        parameter group.
        """
        doc_id = response.meta['id']  # document id passed by start_requests
        html = self._normalize_html(response.body.decode("utf-8"))
        soup = bs(html, 'lxml')

        canshu = []
        peizhi = []
        for group in soup.find_all('table'):
            heading = group.h3
            if heading is None:
                # Not a category table (no <h3> heading) -- nothing to read.
                continue
            groupname = heading.text.strip()

            paramitems = []
            for row in group.find_all('tr'):
                cell = row.td
                if cell is None:
                    # Rows without a data cell carry no value.
                    continue
                header = row.th
                key = header.text.strip() if header is not None else None
                paramitems.append({"name": key, "value": cell.text.strip()})

            if '配置' in groupname or groupname == "信息娱乐":
                peizhi.append({"name": groupname, "configitems": paramitems})
            else:
                canshu.append({"name": groupname, "paramitems": paramitems})

        if not canshu and not peizhi:
            self.logger.warning(
                "no configuration tables found at %s", response.url)

        # ``Collection.update`` is the legacy API; ``update_one`` performs the
        # same single-document ``$set``.
        db1.update_one(
            {"_id": doc_id},
            {"$set": {"参数": canshu, "配置": peizhi}},
        )
# Blog footer that was fused onto the last code line: "最后更新于" ("last updated on").