Login light
import scrapy
from mzik.items import MzikItem
class MzikSpider(scrapy.Spider):
    """Crawl moozik.cn and emit one ``MzikItem`` per post page.

    Callback chain:
    ``parse`` (sidebar links on the start page) -> ``parse_page`` (numbered
    listing pages of one section) -> ``parse_list`` (post links on one
    listing page) -> ``parse_goodspage`` (item extraction).
    """

    name = 'mzikspider'
    start_urls = ['https://moozik.cn/']

    # Sidebar link sets followed from the start page: the 3rd-and-later
    # entries of the 11th top-level menu item, then the entries nested under
    # its 2nd entry.
    # NOTE(review): presumably this is the category menu -- the positional
    # indexes are tied to the current site layout; confirm if the crawl
    # suddenly returns nothing.
    _SECTION_LINK_XPATHS = (
        '//*[@id="aside"]/div/div[1]/nav/ul/li[11]/ul/li[position()>2]/a/@href',
        '//*[@id="aside"]/div/div[1]/nav/ul/li[11]/ul/li[position()=2]/ul/li/a/@href',
    )
    _NAVIGATOR_LINK_XPATH = '//ol[@class="page-navigator"]/li/a/@href'
    _POST_LINK_XPATH = '//div[@class="blog-post"]/div/div/h2/a/@href'

    # (item field, XPath) pairs; each field takes the first match of its XPath.
    _ITEM_FIELD_XPATHS = (
        ('title', '//title/text()'),
        ('author', '//li[@class="meta-author"]/a/text()'),
        ('date', '//li[@class="meta-date"]/time/text()'),
        ('view', '//li[@class="meta-views"]/span[2]/text()'),
        ('comment', '//li[@class="meta-comments"]/a/text()'),
        ('wordcount', '//li[@class="meta-word"]/span[2]/text()'),
        ('categories', '//li[@class="meta-categories"]//a[last()]/text()'),
    )

    def parse(self, response):
        """Yield one request per sidebar section link on the start page.

        Each request is handled by ``parse_page``.
        """
        for link_xpath in self._SECTION_LINK_XPATHS:
            for href in response.xpath(link_xpath).extract():
                # response.follow resolves relative hrefs against the page
                # URL; a bare scrapy.Request would reject them.
                yield response.follow(href, callback=self.parse_page)

    def parse_page(self, response):
        """Yield one request per numbered listing page of a section.

        Listing pages are addressed as ``<section url>/<n>/`` and handled by
        ``parse_list``. Page 1 is always requested, so sections without a
        page navigator are still scraped.
        """
        navigator_links = response.xpath(self._NAVIGATOR_LINK_XPATH).extract()
        # The number of navigator links is used as the exclusive upper bound.
        # NOTE(review): this presumably relies on the navigator holding one
        # extra non-page link (e.g. "next") -- confirm against the markup.
        # The floor of 2 guarantees page 1 when the navigator is missing.
        stop = max(len(navigator_links), 2)
        self.logger.debug('%s: requesting listing pages 1..%d',
                          response.url, stop - 1)

        base_url = response.url
        if not base_url.endswith('/'):
            base_url += '/'
        for page_number in range(1, stop):
            yield scrapy.Request(url='{}{}/'.format(base_url, page_number),
                                 callback=self.parse_list)

    def parse_list(self, response):
        """Yield one request per post linked from a listing page.

        Responses whose status is not 200 are ignored. Each request is
        handled by ``parse_goodspage``.
        """
        if response.status != 200:
            return
        # An empty match list simply yields nothing.
        for href in response.xpath(self._POST_LINK_XPATH).extract():
            yield response.follow(href, callback=self.parse_goodspage)

    def parse_goodspage(self, response):
        """Build and yield a ``MzikItem`` from a single post page.

        ``link`` is the response URL; every other field is the first match of
        its XPath in ``_ITEM_FIELD_XPATHS``, or ``None`` when nothing matches.
        """
        item = MzikItem()
        item['link'] = response.url
        for field_name, field_xpath in self._ITEM_FIELD_XPATHS:
            item[field_name] = response.xpath(field_xpath).extract_first()
        yield item