import scrapy
import pdfkit    # not used by the spider itself; see the PDF conversion sketch after the class
import requests  # only used by the commented-out get_menu() helper below
class html_to_pdf(scrapy.Spider):
    name = 'html_to_pdf'
    start_urls = ['https://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000']

    # Earlier requests-based attempt at writing out the wiki menu, kept for reference but unused:
    # def get_menu():
    #     urls = 'https://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000'
    #     headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36'}
    #     response = requests.get(url=urls, headers=headers)
    #     menu = response.css('.x-wiki-index-item *::text').extract()
    #     filename = 'liaoxuefeng'
    #     for i in menu:
    #         with open(filename, 'a+') as f:
    #             f.write(i)

    nb = 0  # flag: has the wiki menu (table of contents) been written yet?
    def parse(self, response):
        filename = 'liaoxuefeng'
        # Write the wiki index (table of contents) once, on the first response only.
        if self.nb == 0:
            menu = response.css('.x-wiki-index-item *::text').extract()
            with open(filename, 'a+', encoding='utf-8') as f:
                for i in menu:
                    f.write(i)
            self.nb = 1
        # Append the text of the current wiki page.
        page = response.css('.x-wiki-content *::text').extract()
        with open(filename, 'a+', encoding='utf-8') as f:
            for i in page:
                f.write(i)
        # Follow every link in the sidebar index; Scrapy's built-in duplicate filter
        # keeps pages that were already requested from being fetched again.
        # next_page = response.xpath("//div[@class='rst-footer-buttons']//@href").extract_first()
        next_pages = response.xpath("//ul[@class='uk-nav uk-nav-side']//@href").extract()
        for next_page in next_pages:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
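
# pdfkit is imported at the top of this spider but never used. A minimal sketch for turning
# the saved text into a PDF, assuming wkhtmltopdf is installed and the spider has already
# written the 'liaoxuefeng' file (the helper name and output path below are illustrative):
def liaoxuefeng_to_pdf(infile='liaoxuefeng', outfile='liaoxuefeng.pdf'):
    # pdfkit treats the input file as HTML; plain text still renders, but collecting the raw
    # page HTML instead of '*::text' would preserve the original formatting.
    pdfkit.from_file(infile, outfile)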
import scrapy
class linux(scrapy.Spider):
    name = 'get_linux'
    start_urls = ['https://www.linuxprobe.com/chapter-00.html']
    urls = [
        'https://www.linuxprobe.com/chapter-01.html',
        'https://www.linuxprobe.com/chapter-02.html',
        'https://www.linuxprobe.com/chapter-03.html',
        'https://www.linuxprobe.com/chapter-04.html',
        'https://www.linuxprobe.com/chapter-05.html',
        'https://www.linuxprobe.com/chapter-06.html',
        'https://www.linuxprobe.com/chapter-07.html',
        'https://www.linuxprobe.com/chapter-08.html',
        'https://www.linuxprobe.com/chapter-09.html',
        'https://www.linuxprobe.com/chapter-10.html',
        'https://www.linuxprobe.com/chapter-11.html',
        'https://www.linuxprobe.com/chapter-12.html',
        'https://www.linuxprobe.com/chapter-13.html',
        'https://www.linuxprobe.com/chapter-14.html',
        'https://www.linuxprobe.com/chapter-15.html',
        'https://www.linuxprobe.com/chapter-16.html',
        'https://www.linuxprobe.com/chapter-17.html',
        'https://www.linuxprobe.com/chapter-18.html',
        'https://www.linuxprobe.com/chapter-19.html',
        'https://www.linuxprobe.com/chapter-20.html'
    ]
    def parse(self, response):
        filename = 'linux'
        # Grab the chapter title from the page header.
        title = response.css('.post-title *::text').extract_first()
        if title:
            with open(filename, 'a+', encoding='utf-8') as f:
                f.write(title + '\n')
        # Grab the chapter body text; the hard-coded slice drops unwanted
        # fragments before and after the article itself.
        page = response.css('.centent-article *::text').extract()[17:-145]
        # Append the extracted text to the output file, piece by piece.
        with open(filename, 'a+', encoding='utf-8') as f:
            for p in page:
                f.write(p)
        '''
        Crawl the next chapter: pop the next URL off self.urls and pass it to
        scrapy.Request(); the callback hands the response back to this same
        parse method (hence self.parse).
        '''
        if self.urls:
            next_page = self.urls.pop(0)
            yield scrapy.Request(next_page, callback=self.parse)
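
# The pop()-and-chain approach above fetches the chapters strictly in order, but it also
# serialises the crawl. A sketch of the more usual Scrapy pattern, listing every chapter in
# start_urls and letting the scheduler fetch them concurrently, assuming the same selectors
# as above (the class and spider names here are illustrative only):
class linux_concurrent(scrapy.Spider):
    name = 'get_linux_concurrent'
    # chapter-00 .. chapter-20, equivalent to the hand-written list above
    start_urls = ['https://www.linuxprobe.com/chapter-%02d.html' % i for i in range(0, 21)]

    def parse(self, response):
        # same title/body extraction as linux.parse(), minus the self.urls bookkeeping;
        # note that concurrent responses may append to the file out of order
        pass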
import scrapy
class k8s(scrapy.Spider):
    name = 'new_k8s'
    start_urls = ['https://www.kubernetes.org.cn/docs']
    filename = 'new_k8s_docs'
    def parse(self, response):
        '''Collect the documentation URLs from the sidebar, then hand each one to get_doc() to fetch its content.'''
        urls = response.xpath("//div[@class='pageside']//@href").extract()[1:]
        title = response.css('.pageside *::text').extract()[1]
        with open(self.filename, 'a+', encoding='utf-8') as f:
            f.write(title + '\n')
        # print(urls)
        for url in urls:
            yield scrapy.Request(response.urljoin(url), callback=self.get_doc)
    def get_doc(self, response):
        '''Extract the page text and append it to the output file.'''
        # The hard-coded slice drops unwanted fragments before and after the article body.
        page = response.css('.content *::text').extract()[1:-63]
        with open(self.filename, 'a+', encoding='utf-8') as f:
            for p in page:
                f.write(p)
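
# Running these spiders: inside a Scrapy project, e.g.
#     scrapy crawl new_k8s
# or, for a standalone spider file, without a project:
#     scrapy runspider <spider_file>.py
# Each spider appends plain text to its output file ('liaoxuefeng', 'linux',
# 'new_k8s_docs') in the directory the command is run from.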