Python Scrapy: Selected Use Cases
1. Scraping the Django 2.0 Chinese documentation
Start page: https://docs.djangoproject.com/zh-hans/2.0/. Key tag: all of the text to scrape lives in a div named section. Next-page tag: the link sits inside a div named right.
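These selectors can be checked interactively before writing the spider, for example with scrapy shell (a quick sketch; the expressions are the same ones used in the spider below):

scrapy shell 'https://docs.djangoproject.com/zh-hans/2.0/'
>>> response.css('.section *::text').extract()[:5]                    # page text inside the "section" div
>>> response.xpath("//div[@class='right']//@href").extract_first()    # next-page link inside the "right" div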
import scrapy

class django(scrapy.Spider):
    name = 'get_django'
    start_urls = ['https://docs.djangoproject.com/zh-hans/2.0/']

    def parse(self, response):
        # Extract the text content of the target tag, here the div with class "section".
        page = response.css('.section *::text').extract()
        filename = 'first_page'
        # Append each extracted text fragment to the output file.
        with open(filename, 'a+') as f:
            for p in page:
                f.write(p)
        # Check whether there is a next page; if so, extract its (relative) path.
        next_page = response.xpath("//div[@class='right']//@href").extract_first()
        # The extracted path is relative, so join it onto the current URL with urljoin.
        # For example, a path of /intro becomes
        # https://docs.djangoproject.com/zh-hans/2.0/intro/.
        if next_page is not None:
            next_page = response.urljoin(next_page)
            # Request the next page with scrapy.Request(); callback hands the response
            # back to this same parse method (hence self.parse).
            yield scrapy.Request(next_page, callback=self.parse)
Note: when crawling this site you will see "DEBUG: Forbidden by robots.txt ...". This happens because the site's robots.txt disallows some pages for crawlers. In the project's settings.py, change ROBOTSTXT_OBEY = True to False.
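A minimal illustration of that change and of running the spider afterwards (assuming the spider lives in a regular Scrapy project):

# settings.py
ROBOTSTXT_OBEY = False

# then run the spider by its name
scrapy crawl get_django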
2. Scraping Liao Xuefeng's tutorial site
import scrapy, pdfkit, requests, random, time  # only scrapy is used below; the rest were presumably for the commented-out helper and a planned PDF conversion step

class html_to_pdf(scrapy.Spider):
    name = 'html_to_pdf'
    start_urls = ['https://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000']

    # Earlier, abandoned attempt at fetching the menu with requests; note that a
    # requests response has no .css() method, so this would need a Selector to work.
    # def get_menu():
    #     urls = 'https://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000'
    #     headers = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36'
    #     response = requests.get(url=urls, headers=headers)
    #     menu = response.css('.x-wiki-index-item *::text').extract()
    #     filename = 'liaoxuefeng'
    #     for i in menu:
    #         with open(filename, 'a+') as f:
    #             f.write(i)

    nb = 0

    def parse(self, response):
        # Write the table of contents only once, on the first response.
        if self.nb == 0:
            menu = response.css('.x-wiki-index-item *::text').extract()
            filename = 'liaoxuefeng'
            with open(filename, 'a+') as f:
                for i in menu:
                    f.write(i)
            self.nb = 1
        # Append the body text of the current page to the output file.
        page = response.css('.x-wiki-content *::text').extract()
        filename = 'liaoxuefeng'
        with open(filename, 'a+') as f:
            for i in page:
                f.write(i)
        # Follow every link in the sidebar menu; Scrapy deduplicates requests,
        # so pages are not fetched twice.
        # next_page = response.xpath("//div[@class='rst-footer-buttons']//@href").extract_first()
        next_pages = response.xpath("//ul[@class='uk-nav uk-nav-side']//@href").extract()
        for next_page in next_pages:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)

3. Scraping the book "Linux就该这么学"
import scrapy

class linux(scrapy.Spider):
    name = 'get_linux'
    start_urls = ['https://www.linuxprobe.com/chapter-00.html']
    urls = [
        'https://www.linuxprobe.com/chapter-01.html',
        'https://www.linuxprobe.com/chapter-02.html',
        'https://www.linuxprobe.com/chapter-03.html',
        'https://www.linuxprobe.com/chapter-04.html',
        'https://www.linuxprobe.com/chapter-05.html',
        'https://www.linuxprobe.com/chapter-06.html',
        'https://www.linuxprobe.com/chapter-07.html',
        'https://www.linuxprobe.com/chapter-08.html',
        'https://www.linuxprobe.com/chapter-09.html',
        'https://www.linuxprobe.com/chapter-10.html',
        'https://www.linuxprobe.com/chapter-11.html',
        'https://www.linuxprobe.com/chapter-12.html',
        'https://www.linuxprobe.com/chapter-13.html',
        'https://www.linuxprobe.com/chapter-14.html',
        'https://www.linuxprobe.com/chapter-15.html',
        'https://www.linuxprobe.com/chapter-16.html',
        'https://www.linuxprobe.com/chapter-17.html',
        'https://www.linuxprobe.com/chapter-18.html',
        'https://www.linuxprobe.com/chapter-19.html',
        'https://www.linuxprobe.com/chapter-20.html'
    ]
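    # Note: the chapter URLs follow a fixed pattern, so the hand-written list above
    # could equally be generated (a small sketch, equivalent to the 20 entries above):
    # urls = [f'https://www.linuxprobe.com/chapter-{i:02d}.html' for i in range(1, 21)]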
    def parse(self, response):
        filename = 'linux'
        # Write the chapter title, taken from the "post-title" tag.
        title = response.css('.post-title *::text').extract_first()
        with open(filename, 'a+') as f:
            f.write(title + '\n')
        # Extract the chapter body text and slice off the page header/footer fragments.
        page = response.css('.centent-article *::text').extract()[17:-145]
        # Append each extracted text fragment to the output file.
        with open(filename, 'a+') as f:
            for p in page:
                f.write(p)
        # Request the next chapter with scrapy.Request(); the URL is popped from the
        # predefined list and the callback hands the response back to this parse method.
        # Guard against an empty list to avoid an IndexError on the last chapter.
        if self.urls:
            next_page = self.urls.pop(0)
            yield scrapy.Request(next_page, callback=self.parse)

4. Scraping the K8S documentation: two approaches compared
Method 1: every page URL has to be written out in advance, which is not very flexible.
import scrapy

class k8s(scrapy.Spider):
    name = 'get_k8s'
    start_urls = ['https://www.kubernetes.org.cn/docs']
    n = 1
    urls = [
'https://www.kubernetes.org.cn/k8s', 'https://www.kubernetes.org.cn/kubernetes%e8%ae%be%e8%ae%a1%e6%9e%b6%e6%9e%84', 'https://www.kubernetes.org.cn/kubernetes%e8%ae%be%e8%ae%a1%e7%90%86%e5%bf%b5', 'https://www.kubernetes.org.cn/doc-4', 'https://www.kubernetes.org.cn/doc-5', 'https://www.kubernetes.org.cn/doc-6', 'https://www.kubernetes.org.cn/doc-7', 'https://www.kubernetes.org.cn/doc-8', 'https://www.kubernetes.org.cn/doc-9', 'https://www.kubernetes.org.cn/doc-10', 'https://www.kubernetes.org.cn/doc-11', 'https://www.kubernetes.org.cn/doc-12', 'https://www.kubernetes.org.cn/doc-13', 'https://www.kubernetes.org.cn/doc-14', 'https://www.kubernetes.org.cn/doc-15', 'https://www.kubernetes.org.cn/doc-16', 'https://www.kubernetes.org.cn/doc-17', 'https://www.kubernetes.org.cn/doc-18', 'https://www.kubernetes.org.cn/doc-19', 'https://www.kubernetes.org.cn/kubernetes-pod', 'https://www.kubernetes.org.cn/kubernetes-labels', 'https://www.kubernetes.org.cn/%e5%90%8d%e8%af%8d%e8%a7%a3%e9%87%8a%ef%bc%9anamespace', 'https://www.kubernetes.org.cn/replication-controller-kubernetes', 'https://www.kubernetes.org.cn/%e5%90%8d%e8%af%8d%e8%a7%a3%e9%87%8a%ef%bc%9anode', 'https://www.kubernetes.org.cn/replicasets', 'https://www.kubernetes.org.cn/kubernetes-services', 'https://www.kubernetes.org.cn/kubernetes-volumes', 'https://www.kubernetes.org.cn/pvpvcstorageclass', 'https://www.kubernetes.org.cn/deployment', 'https://www.kubernetes.org.cn/secret', 'https://www.kubernetes.org.cn/statefulset', 'https://www.kubernetes.org.cn/daemonset', 'https://www.kubernetes.org.cn/service-account', 'https://www.kubernetes.org.cn/cronjob', 'https://www.kubernetes.org.cn/job', 'https://www.kubernetes.org.cn/security-context-psp', 'https://www.kubernetes.org.cn/resource-quotas', 'https://www.kubernetes.org.cn/network-policy', 'https://www.kubernetes.org.cn/ingress', 'https://www.kubernetes.org.cn/thirdpartyresources', 'https://www.kubernetes.org.cn/configmap', 'https://www.kubernetes.org.cn/podpreset', 'https://www.kubernetes.org.cn/doc-25', 'https://www.kubernetes.org.cn/doc-26', 'https://www.kubernetes.org.cn/horizontal-pod-autoscaling', 'https://www.kubernetes.org.cn/doc-27', 'https://www.kubernetes.org.cn/doc-28', 'https://www.kubernetes.org.cn/doc-29', 'https://www.kubernetes.org.cn/doc-30', 'https://www.kubernetes.org.cn/doc-31', 'https://www.kubernetes.org.cn/doc-32', 'https://www.kubernetes.org.cn/doc-33', 'https://www.kubernetes.org.cn/doc-34', 'https://www.kubernetes.org.cn/doc-35', 'https://www.kubernetes.org.cn/doc-36', 'https://www.kubernetes.org.cn/doc-37', 'https://www.kubernetes.org.cn/doc-38', 'https://www.kubernetes.org.cn/doc-39', 'https://www.kubernetes.org.cn/doc-40', 'https://www.kubernetes.org.cn/doc-41', 'https://www.kubernetes.org.cn/doc-42', 'https://www.kubernetes.org.cn/doc-43', 'https://www.kubernetes.org.cn/doc-44', 'https://www.kubernetes.org.cn/doc-45', 'https://www.kubernetes.org.cn/installkubectl', 'https://www.kubernetes.org.cn/doc-46', 'https://www.kubernetes.org.cn/doc-47', 'https://www.kubernetes.org.cn/doc-48', 'https://www.kubernetes.org.cn/doc-49', 'https://www.kubernetes.org.cn/doc-50', 'https://www.kubernetes.org.cn/doc-51', 'https://www.kubernetes.org.cn/doc-52', 'https://www.kubernetes.org.cn/doc-53', 'https://www.kubernetes.org.cn/doc-54', 'https://www.kubernetes.org.cn/doc-55', 'https://www.kubernetes.org.cn/doc-56', 'https://www.kubernetes.org.cn/doc-57', 'https://www.kubernetes.org.cn/doc-58', 'https://www.kubernetes.org.cn/doc-59', 'https://www.kubernetes.org.cn/doc-60', 
'https://www.kubernetes.org.cn/doc-61', 'https://www.kubernetes.org.cn/doc-62', 'https://www.kubernetes.org.cn/doc-63', 'https://www.kubernetes.org.cn/doc-64', 'https://www.kubernetes.org.cn/doc-65', 'https://www.kubernetes.org.cn/doc-66', 'https://www.kubernetes.org.cn/doc-67'
    ]
    def parse(self, response):
        # The sidebar URL list is not actually used in method 1; method 2 below builds on it.
        urls = response.xpath("//div[@class='pageside']//@href").extract()[1:]
        filename = 'k8s_docs'
        # Write the document title once, on the first response, taken from the "pageside" sidebar.
        if self.n == 1:
            title = response.css('.pageside *::text').extract()[1]
            with open(filename, 'a+') as f:
                f.write(title + '\n')
            self.n = 0
        # Extract the page body text from the "content" tag and slice off header/footer fragments.
        page = response.css('.content *::text').extract()[1:-63]
        # Append each extracted text fragment to the output file.
        with open(filename, 'a+') as f:
            for p in page:
                f.write(p)
        # Request the next page with scrapy.Request(); the URL is popped from the
        # predefined list and the callback hands the response back to this parse method.
        # Guard against an empty list to avoid an IndexError on the last page.
        if self.urls:
            next_page = self.urls.pop(0)
            yield scrapy.Request(next_page, callback=self.parse)

Method 2: less code, more concise.
import scrapy

class k8s(scrapy.Spider):
    name = 'new_k8s'
    start_urls = ['https://www.kubernetes.org.cn/docs']
    filename = 'new_k8s_docs'

    def parse(self, response):
        '''Collect the URL list from the sidebar, then hand each URL to get_doc to fetch the page content.'''
        urls = response.xpath("//div[@class='pageside']//@href").extract()[1:]
        title = response.css('.pageside *::text').extract()[1]
        with open(self.filename, 'a+') as f:
            f.write(title + '\n')
        # print(urls)
        for url in urls:
            yield scrapy.Request(url, callback=self.get_doc)

    def get_doc(self, response):
        '''Extract the page content and append it to the output file.'''
        page = response.css('.content *::text').extract()[1:-63]
        with open(self.filename, 'a+') as f:
            for p in page:
                f.write(p)
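Any of the spiders above can also be tried without creating a full Scrapy project, by saving the class into its own file and running it with scrapy runspider (a minimal sketch; the file names here are arbitrary):

scrapy runspider get_django.py
scrapy runspider new_k8s.py -s ROBOTSTXT_OBEY=False    # settings can also be overridden on the command line

Note that in this setup each spider writes its output to the plain-text file named by filename rather than through a Scrapy item pipeline.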