抓取易车车型配置小记

因为易车网车型配置信息为动态加载,导致之前用于抓取参数的python脚本失效,根据原脚本结合scrapy和splash来进行参数采集

splash:用于需要js动态加载的页面,负责渲染动态js。先启动splash,官方提供有现成镜像,可以直接拿来使用:

docker run -d --name=splash --restart=always -p 8050:8050 scrapinghub/splash:latest

然后新建一个scrapy项目

scrapy startproject yiche
cd yiche

修改scrapy的配置

vi yiche/settings.py
# ---- Project identity and spider discovery ----
BOT_NAME = "yiche"
SPIDER_MODULES = ["yiche.spiders"]
NEWSPIDER_MODULE = "yiche.spiders"

# ---- Crawl politeness ----
# Whether to obey robots.txt.
ROBOTSTXT_OBEY = False
# Delay between requests; crawling more slowly lowers the risk of
# being banned by the server.
DOWNLOAD_DELAY = 1

# ---- scrapy-splash configuration: begin ----
# Address of the Splash service. The container runs on this machine,
# hence localhost.
SPLASH_URL = "http://localhost:8050/"

SPIDER_MIDDLEWARES = {
    "scrapy_splash.SplashDeduplicateArgsMiddleware": 100,
}
DOWNLOADER_MIDDLEWARES = {
    "scrapy_splash.SplashCookiesMiddleware": 723,
    "scrapy_splash.SplashMiddleware": 725,
    "scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware": 810,
}

# Splash-aware duplicate filter and HTTP cache storage.
DUPEFILTER_CLASS = "scrapy_splash.SplashAwareDupeFilter"
HTTPCACHE_STORAGE = "scrapy_splash.SplashAwareFSCacheStorage"

# HTTP cache switches.
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = "httpcache"
# ---- scrapy-splash configuration: end ----

大体思路 1、主爬虫脚本抓取的是每个车型对应页面渲染出的配置列表,例如 http://car.yiche.com/aodia3-3999/m126313/peizhi/ 2、通过mongodb中的bitauto_series和bitauto_vehicle表分别取出车型拼音和车型id用于url拼接 3、将html页面交给BeautifulSoup,使用lxml解析 4、按分类替换不同分类的标签为表格标签,每次取一个分类,按行获取相应的参数值 5、使用replace替换页面中的空格、换行等无用信息,方便过滤内容 6、通过大标题,例如“基本信息,车身尺寸,娱乐配置”等分类将所有配置分为参数和配置两大类,并根据id更新到mongodb中

vi get_yiche.py
# -*- coding: utf-8 -*-

import scrapy
import requests
import re
from bs4 import BeautifulSoup as bs
import datetime
import time
import sys
import os
import pymongo
import xlsxwriter
import json
from scrapy_splash import SplashRequest
from scrapy.http import Request,FormRequest
from scrapy.selector import Selector

# Shared MongoDB handles, created once at import time.
client = pymongo.MongoClient(host="172.23.210.21", port=27017)
# `db` is the "bitauto_series" collection and `db1` the "bitauto_vehicle"
# collection, both inside the "MonthlyTasks" database.
db = client["MonthlyTasks"]["bitauto_series"]
db1 = client["MonthlyTasks"]["bitauto_vehicle"]

# Base URL of the brand site.
starturl_brand = 'https://www.bitauto.com'
# Date at import time, formatted as YYYY-MM-DD.
now = datetime.datetime.now().strftime("%Y-%m-%d")

class GetHtml(scrapy.Spider):
    """Spider that renders Yiche configuration pages through Splash.

    For every vehicle document in ``db1`` whose ``webstatus`` is
    ``"activated"`` a configuration-page URL is built from the series
    pinyin (looked up in ``db``) and the vehicle id.  The rendered page is
    split into groups, and the groups are written back onto the vehicle
    document under the keys ``"参数"`` and ``"配置"``.
    """

    name = 'yiche'
    # The pages requested below live on car.yiche.com, so yiche.com has to
    # be allowed or offsite filtering may drop every request.  bitauto.com
    # is kept for backward compatibility.
    allowed_domains = ["bitauto.com", "yiche.com"]
    # Not used for scheduling because start_requests() is overridden;
    # kept as a sample URL.  (The attribute was previously misspelled
    # ``start_usrls``.)
    start_urls = ["http://car.yiche.com/aodia3-3999/m126313/peizhi/"]

    def start_requests(self):
        """Yield one SplashRequest per activated vehicle document.

        The vehicle document's ``_id`` travels in ``meta['id']`` so that
        ``parse`` can update the matching document.
        """
        # Map series id -> series pinyin, used to build the URL.
        # setdefault keeps the first pinyin seen for a given series id.
        series_pinyin = {}
        for series in db.find({"seriesid": {"$ne": None}}):
            series_pinyin.setdefault(series['seriesid'], series['seriespinyin'])

        # Vehicle ids supply the second URL component.
        vehicles = db1.find({"webstatus": "activated"}).batch_size(100)
        for count, vehicle in enumerate(vehicles, start=1):
            doc_id = vehicle['_id']
            print(count, doc_id)

            seriesid = vehicle['seriesid']
            pinyin = series_pinyin.get(seriesid)
            if pinyin is None:
                # A vehicle pointing at an unknown series must not abort
                # the whole crawl; skip it and keep going.
                self.logger.warning(
                    "No series pinyin for seriesid %r (vehicle %r); skipped",
                    seriesid, doc_id,
                )
                continue

            url = "http://car.yiche.com/%s/m%s/peizhi/" % (pinyin, vehicle['vehicleid'])
            yield SplashRequest(
                url,
                endpoint='render.html',
                args={'wait': 1},
                callback=self.parse,
                meta={'id': doc_id},
            )

    @staticmethod
    def _normalize_html(html):
        """Rewrite the rendered markup so each group becomes its own table.

        The replacements are order-dependent and are applied exactly as
        listed.  The ``'</table>'`` and ``'</th>'`` suffixes are appended to
        the end of the whole document, not to individual elements.
        NOTE(review): the result is not well-formed HTML, so this presumably
        relies on the lxml parser being lenient - confirm.
        """
        html = html.replace('<tr class="data-tr', '<tr')
        # Start a new <table> at every group header row.
        html = html.replace(
            '<tr class="param-carInfo',
            '</table><table><tr class="param-carInfo',
        ) + '</table>'
        html = html.replace('<span>', '')
        html = html.replace('<span class="optional-item-price">', '')
        html = html.replace('<tr class="no-clone">', '')
        html = html.replace('<div class="div-in-td-content">', '')
        html = html.replace('<td><div class="div-in-td-content "', '<td')
        # Cells opened with rowspan="1" are turned into <th> openers; they
        # are later read as the item name.
        html = html.replace('<td rowspan="1"', '<th') + '</th>'
        html = html.replace('<td colspan="6">', '')
        html = html.replace('\n', '').replace('\r', '')
        return html.replace(' ', '')

    def parse(self, response):
        """Extract the groups from a rendered page and store them.

        Groups whose title contains ``'配置'`` or equals ``"信息娱乐"`` are
        stored under ``"配置"``; every other group goes under ``"参数"``.
        Both lists are written to the vehicle document whose ``_id`` arrived
        in ``response.meta['id']``.
        """
        doc_id = response.meta['id']
        html = self._normalize_html(response.body.decode("utf-8"))
        soup = bs(html, 'lxml')

        canshu = []
        peizhi = []
        for group in soup.find_all('table'):
            heading = group.h3
            if heading is None:
                # A table without a title cannot be classified.
                continue
            groupname = heading.text.strip()

            rows = []
            for row in group.find_all('tr'):
                value_cell = row.td
                if value_cell is None:
                    # A row without a value cell carries no data.
                    continue
                name_cell = row.th
                key = name_cell.text.strip() if name_cell is not None else None
                rows.append({"name": key, "value": value_cell.text.strip()})

            if '配置' in groupname or groupname == "信息娱乐":
                peizhi.append({"name": groupname, "configitems": rows})
            else:
                canshu.append({"name": groupname, "paramitems": rows})

        if not canshu and not peizhi:
            self.logger.warning("No groups parsed from %s", response.url)

        # Collection.update() is gone in PyMongo 4; update_one() is the
        # direct replacement for a single-document $set.
        db1.update_one({"_id": doc_id}, {"$set": {"参数": canshu, "配置": peizhi}})

最后更新于