import scrapy
from mzik.items import MzikItem
class MzikSpider(scrapy.Spider):
    """Spider that walks moozik.cn from the sidebar down to single posts.

    Callback chain:

    1. ``parse``           -- start page: collect the sidebar listing links.
    2. ``parse_page``      -- listing landing page: fan out over its pages.
    3. ``parse_list``      -- one listing page: collect the post links.
    4. ``parse_goodspage`` -- one post: emit a populated ``MzikItem``.
    """

    name = 'mzikspider'
    start_urls = ['https://moozik.cn/']

    # Sidebar links under the 11th nav entry: first every entry after the
    # second one, then the links nested inside the second entry's sub-list.
    _SIDEBAR_XPATHS = (
        '//*[@id="aside"]/div/div[1]/nav/ul/li[11]/ul/li[position()>2]/a/@href',
        '//*[@id="aside"]/div/div[1]/nav/ul/li[11]/ul/li[position()=2]/ul/li/a/@href',
    )
    # Links inside the pager of a listing page.
    _PAGER_XPATH = '//ol[@class="page-navigator"]/li/a/@href'
    # Link to each post shown on a listing page.
    _POST_LINK_XPATH = '//div[@class="blog-post"]/div/div/h2/a/@href'
    # (item field, XPath) pairs; the first match of each XPath is stored.
    _FIELD_XPATHS = (
        ('title', '//title/text()'),
        ('author', '//li[@class="meta-author"]/a/text()'),
        ('date', '//li[@class="meta-date"]/time/text()'),
        ('view', '//li[@class="meta-views"]/span[2]/text()'),
        ('comment', '//li[@class="meta-comments"]/a/text()'),
        ('wordcount', '//li[@class="meta-word"]/span[2]/text()'),
        ('categories', '//li[@class="meta-categories"]//a[last()]/text()'),
    )

    def parse(self, response):
        """Yield one request per sidebar listing link on the start page.

        ``response.follow`` resolves relative hrefs against the page URL;
        building ``scrapy.Request`` from a relative href would raise
        ``ValueError``.
        """
        for xpath in self._SIDEBAR_XPATHS:
            for href in response.xpath(xpath).extract():
                yield response.follow(href, callback=self.parse_page)

    def parse_page(self, response):
        """Fan out over the pages of one listing.

        With a pager present, pages ``1 .. len(pager links) - 1`` are
        requested as ``<listing url>/<n>/``.  Without a usable pager the
        response itself is treated as the only listing page, so listings
        that fit on a single page are not silently dropped.
        """
        pager_links = response.xpath(self._PAGER_XPATH).extract()
        self.logger.debug('%s: %d pager link(s)', response.url, len(pager_links))

        # ``extract()`` returns a list (never ``None``); fewer than two links
        # would make the range below empty, so handle this page directly.
        if len(pager_links) < 2:
            yield from self.parse_list(response)
            return

        # Guarantee exactly one slash between the listing URL and the page no.
        base_url = response.url
        if not base_url.endswith('/'):
            base_url += '/'
        for page_no in range(1, len(pager_links)):
            yield scrapy.Request(
                url='{}{}/'.format(base_url, page_no),
                callback=self.parse_list,
            )

    def parse_list(self, response):
        """Yield one request per post linked from a listing page.

        Responses whose status is not 200 are ignored; a page without post
        links simply yields nothing.
        """
        if response.status != 200:
            return
        for href in response.xpath(self._POST_LINK_XPATH).extract():
            yield response.follow(href, callback=self.parse_goodspage)

    def parse_goodspage(self, response):
        """Build and yield a ``MzikItem`` from a single post page.

        ``link`` is the response URL; every other field holds the first
        match of its XPath, or ``None`` when nothing matches.
        """
        item = MzikItem()
        item['link'] = response.url
        for field, xpath in self._FIELD_XPATHS:
            item[field] = response.xpath(xpath).extract_first()
        yield item