scrapy_splash抓取动态页面测试

	# -*- coding: utf-8 -*-
	import scrapy
	import requests
	import random
	import sys
	from scrapy_splash import SplashRequest


	class wangyi(scrapy.Spider):
		"""Spider that walks the photo.163.com albums of ``mtzhuxuebu`` via Splash.

		Every page is requested through ``SplashRequest`` with a fixed wait.
		The crawl goes: album index page -> album page -> photo detail page.
		Photo-page URLs are appended to ``url-list.txt``; the description and
		the last matching menu link of each photo page are appended to
		``list-new.txt``.
		"""
		name = 'get_163'

		# Prefix that every extracted (relative) href is appended to.
		_BASE_URL = 'http://photo.163.com/mtzhuxuebu/'
		# Selector used both for album links on the index page and for
		# photo links on an album page.
		_LINK_XPATH = "//div[@class='ln ln0']//@href"

		def __init__(self, *args, **kwargs):
			"""Initialise the spider and its default request headers.

			Positional and keyword arguments are forwarded to
			``scrapy.Spider.__init__`` so that spider arguments passed by the
			crawler are accepted instead of raising ``TypeError``.
			"""
			super().__init__(*args, **kwargs)
			# NOTE(review): range(1, 1) is empty and nothing in this class reads
			# this attribute; it is kept only so existing external readers of
			# ``page_number`` keep working.
			self.page_number = list(range(1, 1))
			self.header = {
				"user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36"
				}

		def _render(self, url, callback, wait=3):
			"""Build a SplashRequest for *url* using the spider's headers."""
			return SplashRequest(url=url, headers=self.header, args={"wait": wait}, callback=callback)

		def start_requests(self):
			"""Yield one request per album index page.

			The page range is defined by ``range``; currently only page 1.
			"""
			start_usls = 'http://photo.163.com/mtzhuxuebu/#m=0&p='

			for page in range(1, 2):
				yield self._render(start_usls + str(page), self.parse)

		def parse(self, response):
			"""Extract the album links from an index page and request each one.

			Each extracted href (for example ``#m=1&aid=303448226&p=1``) is the
			link to the first page of an album; the trailing number is the page
			index inside that album. Responses are handled by ``parseList``.

			NOTE(review): an album may span several pages, but only the
			extracted first-page link is requested here; further pages are not
			generated by this code.
			"""
			for href in response.xpath(self._LINK_XPATH).extract():
				yield self._render(self._BASE_URL + href, self.parseList)

		def parseList(self, response):
			"""Extract the photo detail links from an album page.

			All resulting URLs are appended to ``url-list.txt`` (one per line)
			and then requested, with ``GetPhotoAdd`` as the callback.
			"""
			urls = [self._BASE_URL + href for href in response.xpath(self._LINK_XPATH).extract()]

			# Open the log once per response instead of once per URL, and only
			# when there is something to write so no empty file is created.
			if urls:
				with open('url-list.txt', 'a', encoding='utf-8') as f:
					f.writelines(url + '\n' for url in urls)

			for url in urls:
				yield self._render(url, self.GetPhotoAdd)

		def GetPhotoAdd(self, response):
			"""Record the original-image link and description of one photo page.

			Appends ``<description list>\\t<link>\\n`` to ``list-new.txt``. The
			image itself is not downloaded. Pages on which no menu link is
			found are logged and skipped instead of raising ``IndexError``.
			"""
			links = response.xpath("//a[@class='ztag201008041230345 menu']//@href").extract()
			if not links:
				self.logger.warning("No original-image link found on %s", response.url)
				return

			# The last matching menu entry is taken as the original-image link.
			getphotoaddlists = links[-1]
			getphotodesc = response.css('.photo-desc *::text').extract()

			# Explicit UTF-8: the description text is not ASCII, and the
			# platform default codec may be unable to encode it.
			with open('list-new.txt', 'a', encoding='utf-8') as file_object:
				file_object.write(str(getphotodesc) + '\t' + getphotoaddlists + '\n')

最后更新于