import time
from typing import Any

import scrapy
from scrapy.http import Response
from loguru import logger

# logger.remove(0)  # uncomment to silence loguru's default stderr sink
logger.add('pages.txt')


class spider(scrapy.Spider):
    """Crawl the Matthew Henry commentary TOC on BibleGateway and tally
    books, chapters, and verse sections into the crawler stats.

    NOTE(review): class name kept lowercase (``spider``) for backward
    compatibility with any external references; PEP 8 would prefer
    PascalCase.
    """

    name = 'matty'
    base_url = 'https://www.biblegateway.com/resources/matthew-henry'
    # base_url is resolvable here because it was just bound in this
    # class namespace.
    start_urls = [f'{base_url}/toc']
    custom_settings = {
        "AUTOTHROTTLE_ENABLED": True,
        # 'DOWNLOAD_DELAY': 2.5,
    }
    # Legacy counters — superseded by the crawler stats set in parse();
    # kept unchanged in case anything external reads them.
    books = 0
    chapters = 0
    sections = 0

    def parse(self, response: Response, **kwargs: Any) -> Any:
        """Parse the table-of-contents page.

        Resets the three stat counters, then follows every book link in
        the resource list, yielding one request per book.
        """
        stats = self.crawler.stats
        stats.set_value('books', 0)
        stats.set_value('chapters', 0)
        stats.set_value('sections', 0)
        for link in response.css('ul.resource-list a'):
            stats.inc_value('books')
            yield response.follow(link, callback=self.parse_book)

    def parse_book(self, response: Response, **kwargs: Any) -> Any:
        """Parse one book page, counting chapter and verse-section links.

        Each anchor's raw HTML is inspected as a string: markup containing
        'Chapter' bumps the 'chapters' stat, markup containing 'Verse'
        bumps 'sections'.  Following those links is currently disabled;
        note that the previously commented-out ``response.follow(a, ...)``
        calls would have been broken anyway, since ``.getall()`` yields
        raw HTML strings, not URLs or selectors.
        """
        for anchor_html in response.css('ul.resource-list a').getall():
            if 'Chapter' in anchor_html:
                self.crawler.stats.inc_value('chapters')
            if 'Verse' in anchor_html:
                self.crawler.stats.inc_value('sections')