import time
from pathlib import Path
from typing import Any

import scrapy
from scrapy.http import Response
from loguru import logger

# Drop loguru's default stderr sink; uncomment below to log to a file instead.
logger.remove(0)
# logger.add('pages.txt')


class spider(scrapy.Spider):
    """Crawl Matthew Henry's commentary on BibleGateway.

    Starting from the table of contents, follow every book link, then every
    chapter/intro link within each book, and save each article's ``<p>``
    fragments as HTML files under ``base_folder``.
    """

    name = 'matty'
    base_folder = Path('./scraps/').absolute()
    # allowed_domains = ['https://www.biblegateway.com/']
    base_url = 'https://www.biblegateway.com/resources/matthew-henry'
    start_urls = [f'{base_url}/toc']
    custom_settings = {
        # "AUTOTHROTTLE_ENABLED": True,
        # 'DOWNLOAD_DELAY': 1.5,
    }

    def parse(self, response: Response, **kwargs: Any) -> Any:
        """Parse the table of contents: zero the stat counters and follow
        every book link to :meth:`parse_book`."""
        self.crawler.stats.set_value('books', 0)
        self.crawler.stats.set_value('chapters', 0)
        self.crawler.stats.set_value('sections', 0)
        for a in response.css('ul.resource-list a'):
            self.crawler.stats.inc_value('books')
            yield response.follow(a, callback=self.parse_book)

    def parse_book(self, response: Response, **kwargs: Any) -> Any:
        """Parse a book page: route each link to the chapter or plain-page
        saver, then save the book page's own article text."""
        for a in response.css('ul.resource-list a'):
            logger.debug(a)
            self.crawler.stats.inc_value('chapters')
            # .get() returns None on an empty selection; the original
            # indexed [0] unconditionally, which raised IndexError for
            # links with no text node.
            link_text = a.css('::text').get() or ''
            if 'CHAPTER' in link_text.upper():
                yield response.follow(a, callback=self.get_chapter)
            else:
                yield response.follow(a, callback=self.get_data)
        # The book landing page itself also carries article text; reuse the
        # plain-page saver instead of duplicating its save logic here.
        self.get_data(response)

    def get_data(self, response: Response, **kwargs: Any) -> Any:
        """Save a non-chapter article page's paragraphs to disk, named
        after the last URL path segment."""
        file_name = response.url.split('/')[-1].replace('.', '_') + '.html'
        logger.warning(file_name)
        data = response.css('div.article p').getall()
        self.write_data(file_name, data)

    def get_chapter(self, response: Response, **kwargs: Any) -> Any:
        """Save a chapter article page's paragraphs to disk with a
        ``chapter_`` filename prefix."""
        file_name = response.url.split('/')[-1].replace('.', '_') + '.html'
        file_name = f'chapter_{file_name}'
        logger.warning(file_name)
        data = response.css('div.article p').getall()
        self.write_data(file_name, data)

    def write_data(self, filename: str, data: Any) -> None:
        """Write each HTML fragment in *data* to ``base_folder/filename``,
        one fragment per line, creating parent directories as needed."""
        file = self.base_folder / filename
        file.parent.mkdir(exist_ok=True, parents=True)
        # 'w' truncates like the original 'w+'; explicit UTF-8 avoids
        # depending on the platform's default text encoding.
        with open(file, 'w', encoding='utf-8') as f:
            for d in data:
                f.write(d)
                f.write('\n')