mh_parser/matty.py
2023-12-19 06:01:14 -05:00

85 lines
2.8 KiB
Python

import time
from typing import Any
from pathlib import Path
import scrapy
from scrapy.http import Response
from loguru import logger
# Drop loguru's default stderr sink (handler id 0) so the spider's own
# logger.debug/warning calls stay silent unless a sink is added below.
logger.remove(0)
# logger.add('pages.txt')
class spider(scrapy.Spider):
    """Crawl Matthew Henry's commentary on BibleGateway and save every
    article page as an HTML fragment under ``./scraps/``.

    Crawl shape: table of contents -> one page per book -> chapter and
    section pages.  Crawler stats count the books and chapters seen.
    """

    name = 'matty'
    base_folder = Path('./scraps/').absolute()
    # allowed_domains = ['https://www.biblegateway.com/']
    base_url = 'https://www.biblegateway.com/resources/matthew-henry'
    start_urls = [f'{base_url}/toc']
    custom_settings = {
        # "AUTOTHROTTLE_ENABLED": True,
        # 'DOWNLOAD_DELAY': 1.5,
    }

    def parse(self, response: Response, **kwargs: Any) -> Any:
        """Parse the table of contents: reset stats, follow one link per book."""
        self.crawler.stats.set_value('books', 0)
        self.crawler.stats.set_value('chapters', 0)
        self.crawler.stats.set_value('sections', 0)
        for a in response.css('ul.resource-list a'):
            # logger.info(a)
            self.crawler.stats.inc_value('books')
            yield response.follow(a, callback=self.parse_book)

    def parse_book(self, response: Response, **kwargs: Any) -> Any:
        """Parse a book page: follow its chapter/section links, then save
        the book page itself."""
        for a in response.css('ul.resource-list a'):
            logger.debug(a)
            self.crawler.stats.inc_value('chapters')
            # Bug fix: a link with no text node used to raise IndexError on
            # ``a.css('::text')[0]``; fall back to '' so it routes to get_data.
            link_text = (a.css('::text').get() or '').upper()
            if 'CHAPTER' in link_text:
                yield response.follow(a, callback=self.get_chapter)
            else:
                yield response.follow(a, callback=self.get_data)
        self._save_page(response)

    def get_data(self, response: Response, **kwargs: Any) -> Any:
        """Save a section/intro page."""
        self._save_page(response)

    def get_chapter(self, response: Response, **kwargs: Any) -> Any:
        """Save a chapter page, prefixing its file name with 'chapter_'."""
        self._save_page(response, prefix='chapter_')

    def _save_page(self, response: Response, prefix: str = '') -> None:
        """Shared save path for all page callbacks (was duplicated three
        times): derive the file name from the URL's last path segment and
        write the page's article paragraphs."""
        file_name = prefix + response.url.split('/')[-1].replace('.', '_') + '.html'
        logger.warning(file_name)
        data = response.css('div.article p').getall()
        self.write_data(file_name, data)

    def write_data(self, filename: str, data: list) -> None:
        """Write each HTML fragment in *data* to base_folder/filename,
        one fragment per line."""
        file = self.base_folder / filename
        file.parent.mkdir(exist_ok=True, parents=True)
        # 'w' (not 'w+': nothing is read back) with explicit UTF-8 so the
        # output does not depend on the platform's default encoding.
        with open(file, 'w', encoding='utf-8') as f:
            f.writelines(d + '\n' for d in data)