This commit is contained in:
boose_magoose 2023-12-10 21:34:29 -05:00
parent 9997abcbe6
commit cc53817dd8
2 changed files with 62 additions and 4 deletions

14
main.py
View File

@@ -381,10 +381,16 @@ class MyHTMLParser(HTMLParser):
logger.debug(self.passage_verse)
raise
self.write_to_file('## Verses: ')
self.write_to_file(
f'{verse.start_verse} - {verse.end_verse}'
)
if verse.start_verse == verse.end_verse:
self.write_to_file('## Verse: ')
self.write_to_file(
f'{verse.start_verse}'
)
else:
self.write_to_file('## Verses: ')
self.write_to_file(
f'{verse.start_verse} - {verse.end_verse}'
)
self.write_to_file('\n\n')
# Need this so the regex in self.clean does not pick

52
matty.py Normal file
View File

@@ -0,0 +1,52 @@
import time
from typing import Any
import scrapy
from scrapy.http import Response
from loguru import logger
# Removal of loguru's pre-configured handler (id 0) is left disabled, so the
# default sink stays active alongside the file sink added below.
# logger.remove(0)
# Register 'pages.txt' as an additional loguru sink; this runs at import time.
logger.add('pages.txt')
class spider(scrapy.Spider):
    """Walk the Matthew Henry commentary index on BibleGateway and tally links.

    The crawl starts at the table-of-contents page, follows every anchor found
    under ``ul.resource-list`` to ``parse_book``, and records three counters
    ('books', 'chapters', 'sections') in the crawler's stats collector.
    No items are produced; only the stats are updated.
    """

    name = 'matty'
    base_url = 'https://www.biblegateway.com/resources/matthew-henry'
    start_urls = [f'{base_url}/toc']
    custom_settings = {
        "AUTOTHROTTLE_ENABLED": True,
        # 'DOWNLOAD_DELAY': 2.5,
    }

    # Class-level counters; the callbacks below report through
    # self.crawler.stats and do not touch these attributes.
    books = 0
    chapters = 0
    sections = 0

    def parse(self, response: Response, **kwargs: Any) -> Any:
        """Handle the table-of-contents page.

        Resets the three stats counters to zero, then, for every anchor under
        ``ul.resource-list``, bumps 'books' and yields a follow-up request
        whose callback is ``parse_book``.
        """
        stats = self.crawler.stats
        for counter in ('books', 'chapters', 'sections'):
            stats.set_value(counter, 0)

        for link in response.css('ul.resource-list a'):
            stats.inc_value('books')
            yield response.follow(link, callback=self.parse_book)

    def parse_book(self, response: Response, **kwargs: Any) -> Any:
        """Count chapter and verse links on a page reached from the TOC.

        Each anchor under ``ul.resource-list`` is serialized to its HTML
        string; the substring test therefore matches anywhere in the markup
        (tag attributes as well as link text). The two checks are independent,
        so a single anchor may increment both counters.

        Nothing is yielded: following the chapter/verse links is not enabled
        here, so this callback only updates stats and returns None.
        """
        # (substring to look for, stats key to increment), checked in this order
        keyword_counters = (
            ('Chapter', 'chapters'),
            ('Verse', 'sections'),
        )
        for anchor_html in response.css('ul.resource-list a').getall():
            for keyword, counter in keyword_counters:
                if keyword in anchor_html:
                    self.crawler.stats.inc_value(counter)