current
This commit is contained in:
parent
9997abcbe6
commit
cc53817dd8
14
main.py
14
main.py
@ -381,10 +381,16 @@ class MyHTMLParser(HTMLParser):
|
|||||||
logger.debug(self.passage_verse)
|
logger.debug(self.passage_verse)
|
||||||
raise
|
raise
|
||||||
|
|
||||||
self.write_to_file('## Verses: ')
|
if verse.start_verse == verse.end_verse:
|
||||||
self.write_to_file(
|
self.write_to_file('## Verse: ')
|
||||||
f'{verse.start_verse} - {verse.end_verse}'
|
self.write_to_file(
|
||||||
)
|
f'{verse.start_verse}'
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
self.write_to_file('## Verses: ')
|
||||||
|
self.write_to_file(
|
||||||
|
f'{verse.start_verse} - {verse.end_verse}'
|
||||||
|
)
|
||||||
self.write_to_file('\n\n')
|
self.write_to_file('\n\n')
|
||||||
|
|
||||||
# Need this so the regex in self.clean does not pick
|
# Need this so the regex in self.clean does not pick
|
||||||
|
52
matty.py
Normal file
52
matty.py
Normal file
@ -0,0 +1,52 @@
|
|||||||
|
import time
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import scrapy
|
||||||
|
from scrapy.http import Response
|
||||||
|
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
# logger.remove(0)
|
||||||
|
logger.add('pages.txt')
|
||||||
|
|
||||||
|
|
||||||
|
class spider(scrapy.Spider):
|
||||||
|
name = 'matty'
|
||||||
|
base_url = 'https://www.biblegateway.com/resources/matthew-henry'
|
||||||
|
start_urls = [f'{base_url}/toc']
|
||||||
|
|
||||||
|
custom_settings = {
|
||||||
|
"AUTOTHROTTLE_ENABLED": True,
|
||||||
|
# 'DOWNLOAD_DELAY': 2.5,
|
||||||
|
}
|
||||||
|
|
||||||
|
books = 0
|
||||||
|
chapters = 0
|
||||||
|
sections = 0
|
||||||
|
|
||||||
|
|
||||||
|
def parse(self, response: Response, **kwargs: Any) -> Any:
|
||||||
|
self.crawler.stats.set_value('books', 0)
|
||||||
|
self.crawler.stats.set_value('chapters', 0)
|
||||||
|
self.crawler.stats.set_value('sections', 0)
|
||||||
|
|
||||||
|
for a in response.css('ul.resource-list a'):
|
||||||
|
# logger.info(a)
|
||||||
|
self.crawler.stats.inc_value('books')
|
||||||
|
yield response.follow(a, callback=self.parse_book)
|
||||||
|
|
||||||
|
def parse_book(self, response: Response, **kwargs: Any) -> Any:
|
||||||
|
# for a in response.copy().css('div.article a').getall():
|
||||||
|
# logger.info(a)
|
||||||
|
|
||||||
|
for a in response.css('ul.resource-list a').getall():
|
||||||
|
if 'Chapter' in a:
|
||||||
|
self.crawler.stats.inc_value('chapters')
|
||||||
|
# logger.info(f"Chapter: {a}")
|
||||||
|
# yield response.follow(a, callback=self.parse_chapter)
|
||||||
|
|
||||||
|
if 'Verse' in a:
|
||||||
|
self.crawler.stats.inc_value('sections')
|
||||||
|
# logger.info(f"Verse: {a}")
|
||||||
|
# yield response.follow(a, callback=self.parse_verses)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user