NOTE(review): line breaks and indentation below were reconstructed from a
whitespace-collapsed copy of this patch. Confirm the context-line indentation
in the main.py hunk against the real file before applying. The blob id line
for matty.py was dropped because the file contents were revised.

diff --git a/main.py b/main.py
index 2a1756a..15ad8ee 100644
--- a/main.py
+++ b/main.py
@@ -381,10 +381,16 @@ class MyHTMLParser(HTMLParser):
                 logger.debug(self.passage_verse)
                 raise
 
-            self.write_to_file('## Verses: ')
-            self.write_to_file(
-                f'{verse.start_verse} - {verse.end_verse}'
-            )
+            if verse.start_verse == verse.end_verse:
+                self.write_to_file('## Verse: ')
+                self.write_to_file(
+                    f'{verse.start_verse}'
+                )
+            else:
+                self.write_to_file('## Verses: ')
+                self.write_to_file(
+                    f'{verse.start_verse} - {verse.end_verse}'
+                )
             self.write_to_file('\n\n')
 
             # Need this do the regex in self.clean does not pick
diff --git a/matty.py b/matty.py
new file mode 100644
--- /dev/null
+++ b/matty.py
@@ -0,0 +1,66 @@
+"""Scrapy spider that tallies the BibleGateway Matthew Henry index links."""
+import time
+from typing import Any
+
+import scrapy
+from scrapy.http import Response
+
+from loguru import logger
+
+# logger.remove(0)
+logger.add('pages.txt')
+
+
+class spider(scrapy.Spider):
+    """Start at the resource's /toc page and count the links it leads to.
+
+    No page content is extracted: the callbacks only update the 'books',
+    'chapters' and 'sections' counters in the crawler's stats collector.
+    """
+
+    name = 'matty'
+    base_url = 'https://www.biblegateway.com/resources/matthew-henry'
+    start_urls = [f'{base_url}/toc']
+
+    custom_settings = {
+        "AUTOTHROTTLE_ENABLED": True,
+        # 'DOWNLOAD_DELAY': 2.5,
+    }
+
+    # The callbacks in this file count through crawler.stats, not these.
+    books = 0
+    chapters = 0
+    sections = 0
+
+    def parse(self, response: Response, **kwargs: Any) -> Any:
+        """Zero the counters, then follow every link in the resource list.
+
+        Yields one request per ``ul.resource-list a`` anchor, routed to
+        ``parse_book``, and adds one to the 'books' stat for each.
+        """
+        stats = self.crawler.stats
+        for counter in ('books', 'chapters', 'sections'):
+            stats.set_value(counter, 0)
+
+        for book_link in response.css('ul.resource-list a'):
+            stats.inc_value('books')
+            yield response.follow(book_link, callback=self.parse_book)
+
+    def parse_book(self, response: Response, **kwargs: Any) -> Any:
+        """Count the 'Chapter' and 'Verse' links on a followed page.
+
+        Each anchor's serialized HTML (tag, attributes and text) is
+        searched for both words independently, so a single anchor can
+        add to 'chapters' and to 'sections'. No requests are yielded yet.
+        """
+        stats = self.crawler.stats
+        for link in response.css('ul.resource-list a'):
+            markup = link.get()
+
+            if 'Chapter' in markup:
+                stats.inc_value('chapters')
+                # TODO: follow `link` with a parse_chapter callback.
+
+            if 'Verse' in markup:
+                stats.inc_value('sections')
+                # TODO: follow `link` with a parse_verses callback.