# scrapy_splash抓取动态页面测试 (scrapy_splash dynamic page scraping test)
# -*- coding: utf-8 -*-
import scrapy
import requests
import random
import sys
from scrapy_splash import SplashRequest
class wangyi(scrapy.Spider):
    """Crawl a photo.163.com album site through Splash and record photo links.

    The crawl has three levels, each rendered by Splash because the pages are
    built with JavaScript:

    1. ``start_requests`` -> album index pages,
    2. ``parse`` -> one request per album found on an index page,
    3. ``parseList`` -> one request per photo found in an album,
    4. ``GetPhotoAdd`` -> extract the original-image link of a photo.

    Results are appended to plain-text files in the working directory:
    ``url-list.txt`` (photo detail page URLs) and ``list-new.txt``
    (photo description and original-image link, tab separated).
    """

    name = 'get_163'

    # Prefix that the relative ``#m=...`` links found in the pages are
    # appended to in order to build a full URL.
    album_base_url = 'http://photo.163.com/mtzhuxuebu/'

    def __init__(self, *args, **kwargs):
        """Initialise the spider.

        Positional and keyword arguments are forwarded to ``scrapy.Spider`` so
        the spider can be created by Scrapy with spider arguments (``-a``).
        """
        super().__init__(*args, **kwargs)
        # NOTE: range(1, 1) is empty and nothing in this class reads this
        # attribute; it is kept only so existing external users do not break.
        self.page_number = list(range(1, 1))
        self.header = {
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36"
        }

    def _splash_request(self, url, callback):
        """Build a Splash request for *url* with the spider's headers and a 3s wait."""
        return SplashRequest(url=url, headers=self.header, args={"wait": 3}, callback=callback)

    def start_requests(self):
        """Yield a request for every album index page.

        The page range is defined locally with ``range``; currently only
        page 1 is requested.
        """
        start_usls = 'http://photo.163.com/mtzhuxuebu/#m=0&p='
        for page in range(1, 2):
            yield self._splash_request(start_usls + str(page), self.parse)

    def parse(self, response):
        """Extract the link of every album on an index page.

        Each link is handed to ``parseList``.

        An album link looks like ``#m=1&aid=303448226&p=1``, where the final
        number is the page within the album. Only that first page is
        requested here; additional album pages are NOT followed.
        """
        album_hrefs = response.xpath("//div[@class='ln ln0']//@href").extract()
        for href in album_hrefs:
            yield self._splash_request(self.album_base_url + href, self.parseList)

    def parseList(self, response):
        """Extract the detail-page link of every photo on an album page.

        Each photo URL is requested with ``GetPhotoAdd`` as callback and is
        also appended to ``url-list.txt``, one URL per line.
        """
        photo_hrefs = response.xpath("//div[@class='ln ln0']//@href").extract()
        for href in photo_hrefs:
            url = self.album_base_url + href
            yield self._splash_request(url, self.GetPhotoAdd)
            # Explicit UTF-8 so the output does not depend on the platform's
            # default encoding.
            with open('url-list.txt', 'a', encoding='utf-8') as f:
                f.write(url + '\n')

    def GetPhotoAdd(self, response):
        """Record the original-image link and description of one photo.

        The last matching menu link on the page is taken as the original
        image link. The description text nodes and the link are appended to
        ``list-new.txt`` as ``<description list>\\t<link>``. The image itself
        is not downloaded. Pages without a matching link are skipped with a
        warning instead of raising ``IndexError``.
        """
        menu_hrefs = response.xpath("//a[@class='ztag201008041230345 menu']//@href").extract()
        if not menu_hrefs:
            self.logger.warning("No original-image link found on %s", response.url)
            return
        getphotoaddlists = menu_hrefs[-1]
        getphotodesc = response.css('.photo-desc *::text').extract()
        # Descriptions may contain non-ASCII text, so force UTF-8 output.
        with open('list-new.txt', 'a', encoding='utf-8') as file_object:
            file_object.write(str(getphotodesc) + '\t')
            file_object.write(getphotoaddlists)
            file_object.write('\n')