# -*- coding: utf-8 -*-
import scrapy
import requests
import re
from bs4 import BeautifulSoup as bs
import datetime
import time
import sys
import os
import pymongo
import xlsxwriter
import json
from scrapy_splash import SplashRequest
from scrapy.http import Request,FormRequest
from scrapy.selector import Selector
# Shared MongoDB connection, created at import time (module-level side effect).
client = pymongo.MongoClient("172.23.210.21", 27017)
# ``bitauto_series``: read below to map series ids to the pinyin slugs used in page URLs.
db = client.MonthlyTasks.get_collection('bitauto_series')
# ``bitauto_vehicle``: source of activated vehicles; parsed spec/config data is written back here.
db1 = client.MonthlyTasks.get_collection('bitauto_vehicle')
# Today's date as "YYYY-MM-DD"; not referenced in the visible code — presumably used elsewhere.
now = datetime.datetime.now().strftime("%Y-%m-%d")
# Site base URL; also unused in the visible code — TODO confirm it is still needed.
starturl_brand = 'https://www.bitauto.com'
class GetHtml(scrapy.Spider):
    """Scrape per-vehicle spec/config tables from car.yiche.com.

    Reads series pinyin slugs from ``bitauto_series`` and activated vehicle
    ids from ``bitauto_vehicle``, renders each vehicle's ``/peizhi/`` page
    through Splash, parses the parameter (参数) and configuration (配置)
    tables, and writes the result back onto the vehicle document.
    """

    name = 'yiche'
    allowed_domains = ["bitauto.com"]
    # Fixed typo: was ``start_usrls``. Unused in practice because
    # start_requests() is overridden, but keep the conventional name.
    start_urls = ["http://car.yiche.com/aodia3-3999/m126313/peizhi/"]

    def start_requests(self):
        """Yield one SplashRequest per activated vehicle's spec page."""
        # Map seriesid -> pinyin slug, needed to build each page URL.
        pinyin_by_series = {}
        for series in db.find({"seriesid": {"$ne": None}}):
            pinyin_by_series.setdefault(series['seriesid'], series['seriespinyin'])
        # batch_size keeps the cursor healthy over a long crawl.
        vehicles = db1.find({"webstatus": "activated"}).batch_size(100)
        for count, vehicle in enumerate(vehicles, start=1):
            doc_id = vehicle['_id']  # renamed from ``id`` (shadowed the builtin)
            print(count, doc_id)
            seriespinyin = pinyin_by_series[vehicle['seriesid']]
            url = "http://car.yiche.com/%s/m%s/peizhi/" % (seriespinyin, vehicle['vehicleid'])
            # The Mongo _id rides along in meta so parse() can update the same doc.
            yield SplashRequest(url, endpoint='render.html', args={'wait': 1},
                                callback=self.parse, meta={'id': doc_id})

    def parse(self, response):
        """Parse the rendered spec page and persist 参数/配置 onto the vehicle doc."""
        canshu = []   # parameter groups (参数)
        peizhi = []   # configuration groups (配置)
        doc_id = response.meta['id']  # _id passed through from start_requests
        html = response.body.decode("utf-8")
        # Rewrite the markup so every group becomes its own <table> and row
        # labels become <th>; this makes the BeautifulSoup walk below trivial.
        # (Dead ``+ ''`` concatenations from the original were dropped — no-ops.)
        html = html.replace('<tr class="data-tr', '<tr')
        html = html.replace('<tr class="param-carInfo',
                            '</table><table><tr class="param-carInfo') + '</table>'
        html = html.replace('<span>', '')
        html = html.replace('<span class="optional-item-price">', '')
        html = html.replace('<tr class="no-clone">', '')
        html = html.replace('<div class="div-in-td-content">', '')
        html = html.replace('<td><div class="div-in-td-content "', '<td')
        # The trailing '</th>' closes the final converted cell at end of document.
        html = html.replace('<td rowspan="1"', '<th') + '</th>'
        html = html.replace('<td colspan="6">', '')
        html = html.replace('\n', '').replace('\r', '')
        html = html.replace(' ', '')
        soup = bs(html, 'lxml')
        for group in soup.find_all('table'):
            groupname = group.h3.text.strip()
            paramitems = []
            for item in group.find_all('tr'):
                th = item.th
                key = th.text.strip() if th is not None else None
                td = item.td
                # Guard added: a row without a <td> previously raised
                # AttributeError while <th> was already checked for None.
                value = td.text.strip() if td is not None else None
                paramitems.append({"name": key, "value": value})
            if '配置' in groupname or groupname == "信息娱乐":
                peizhi.append({"name": groupname, "configitems": paramitems})
            else:
                canshu.append({"name": groupname, "paramitems": paramitems})
        # Collection.update() was removed in PyMongo 4.x; update_one() is the
        # drop-in equivalent for a single-document $set.
        db1.update_one({"_id": doc_id}, {"$set": {"参数": canshu, "配置": peizhi}})