boose_magoose 2023-12-19 06:01:14 -05:00
parent a5236b9dc0
commit 22e3584494
8 changed files with 63 additions and 803237 deletions

main.py (146 changed lines)

@@ -1,4 +1,3 @@
import itertools
import re
import string
import sys
@@ -10,86 +9,10 @@ from bs4 import BeautifulSoup
from loguru import logger
from tqdm import tqdm
base = Path('./xml/Matthew Henry Commentary 2').absolute()
base = Path('./xml/Matthew Henry Commentary').absolute()
logger.remove(0)
from functools import partialmethod
# Register the custom "foobar" level first; loguru's log() takes the level
# name positionally, so it must not also be passed as a keyword argument.
logger.level("foobar", no=30)
logger.__class__.foobar = partialmethod(logger.__class__.log, "foobar")
logger.add('log.txt', level='WARNING')
convert = dict([
("Eccl", "Ecclesiastes"),
("Rom", "Romans"),
("2Tim", "2Timothy"),
("2Pet", "2Peter"),
("Gen", "Genesis"),
("John", "John"),
("1Cor", "1Corinthians"),
("Ezek", "Ezekiel"),
("Heb", "Hebrews"),
("Rev", "Revelation"),
("Eph", "Ephesians"),
("Isa", "Isaiah"),
("Ps", "Psalms"),
("Acts", "Acts"),
("Deut", "Deuteronomy"),
("Neh", "Nehemiah"),
("1Chr", "1Chronicles"),
("2Kgs", "2Kings"),
("2Cor", "2Corinthians"),
("Hos", "Hosea"),
("Exod", "Exodus"),
("Matt", "Matthew"),
("Job", "Job"),
("Prov", "Proverbs"),
("Col", "Colossians"),
("Jer", "Jeremiah"),
("1John", "1John"),
("Jas", "James"),
("1Tim", "1Timothy"),
("Amos", "Amos"),
("Mic", "Micah"),
("Dan", "Daniel"),
("Jonah", "Jonah"),
("Luke", "Luke"),
("Mal", "Malachi"),
("Zech", "Zechariah"),
("Phil", "Philippians"),
("Mark", "Mark"),
("2Thess", "2Thessalonians"),
("Jude", "Jude"),
("1Sam", "1Samuel"),
("Num", "Numbers"),
("Gal", "Galatians"),
("1Pet", "1Peter"),
("Lev", "Leviticus"),
("2Chr", "2Chronicles"),
("Lam", "Lamentations"),
("1Kgs", "1Kings"),
("Ezra", "Ezra"),
("Obad", "Obadiah"),
("Hab", "Habakkuk"),
("2Sam", "2Samuel"),
("Josh", "Joshua"),
("Judg", "Judges"),
("Ruth", "Ruth"),
("Esth", "Esther"),
("Phlm", "Philemon"),
("1Thess", "1Thessalonians"),
("Joel", "Joel"),
("Titus", "Titus"),
("3John", "3John"),
("Zeph", "Zephaniah"),
("Song", "Song of Songs"),
("Hag", "Haggai"),
("2John", "2John"),
("Nah", "Nahum"),
("Psa", "Psalms"),
("2Peter", "2Peter")
]
)
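# `convert` maps OSIS-style book abbreviations to the full names the `bible`
# helpers expect, e.g. convert["Eccl"] == "Ecclesiastes".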
logger.add('log.txt')
class MyHTMLParser(HTMLParser):
file = None
@@ -229,17 +152,15 @@ class MyHTMLParser(HTMLParser):
file.write(line)
def normalize_osis_verses(self, verses):
"""Takes this
'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Col.1.16 Bible:Heb.1.2 Bible:Acts.17.24-Acts.17.25 Bible:Exod.5.26 Bible:Gen.5'
@staticmethod
def normalize_osis_verses(verses):
"""Takes this 'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Col.1.16 Bible:Heb.1.2 Bible:Acts.17.24-Acts.17.25'
and turns it into something more readable"""
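# Illustrative round trip (assuming pythonbible-style reference formatting):
# 'Bible:John.1.3 Bible:Acts.17.24-Acts.17.25' -> 'John 1:3;Acts 17:24-25'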
normalized_refs = list()
for chunks in verses.removeprefix("Bible:").split('Bible:'):
spanned_verses = list()
for ref in chunks.split('-'):
verse = ref.replace('.', ' ', 1).replace('.', ':')
book = ref.split('.')[0]
# str.replace returns a new string; the result must be reassigned.
verse = verse.replace(book, convert[book])
spanned_verses.append(
bible.format_scripture_references(
bible.get_references(verse.strip())
@@ -253,57 +174,16 @@ class MyHTMLParser(HTMLParser):
except Exception:
logger.warning(f"Error with: {verses=}: {spanned_verses=}")
raise
if normalized is False:
print(f"Error with: {verses=}: {spanned_verses=}")
sys.exit()
normalized_refs.append(normalized)
return ';'.join(normalized_refs)
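# The ';'-joined string is later split back apart by obsidian_links.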
# @staticmethod
def old_normalize_osis_verses(self, verses):
"""Takes this
'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Acts.17.24-Acts.17.25 Bible:Exod.5.26 Bible:Gen.5'
and turns it into something more readable"""
normalized_refs = list()
for chunks in verses.removeprefix("Bible:").split('Bible:'):
spanned_verses = chunks.split('-')
if len(spanned_verses) < 2:
spanned_verses = list(itertools.repeat(spanned_verses[0], 2))
book, start_chapter, start_verse = spanned_verses[0].split('.')
_, end_chapter, end_verse = spanned_verses[1].split('.')
book = bible.Book[convert[book].upper()]
normalized = bible.NormalizedReference(
book=book,
start_chapter=start_chapter,
start_verse=start_verse,
end_chapter=end_chapter,
end_verse=end_verse,
)
if normalized is False:
logger.foobar(f'{self.args=}')
normalized_refs.append(
bible.format_scripture_references([normalized]).strip()
)
return ';'.join(normalized_refs)
@staticmethod
@logger.catch(reraise=False)
def obsidian_links(verses):
"""Take `Book Chapter:Ver-Ver and turn it into a obsidian bible link"""
links = []
for verse in verses.split(';'):
verse = verse.replace(':', '#').replace('-', '..')
if verse[0] in string.digits:
verse = verse.replace(' ', '', 1)
links.append(f"[[{verse}]]")
links.append(f"[[{verse.replace(':', '#').replace('-', '..')}]]")
return '; '.join(links)
@@ -357,10 +237,9 @@ class MyHTMLParser(HTMLParser):
self.attrs['osisref']
)
else:
# verses = self.normalize_osis_verses(
# self.convert_passage(self.attrs['passage'])
# )
raise ValueError('No `osisref` in tag attrs')
verses = self.normalize_osis_verses(
self.convert_passage(self.attrs['passage'])
)
self.write_to_file(self.obsidian_links(verses))
@@ -377,12 +256,6 @@ class MyHTMLParser(HTMLParser):
logger.debug(self.passage_verse)
raise
if verse.start_verse == verse.end_verse:
self.write_to_file('## Verse: ')
self.write_to_file(
f'{verse.start_verse}'
)
else:
self.write_to_file('## Verses: ')
self.write_to_file(
f'{verse.start_verse} - {verse.end_verse}'
@@ -392,7 +265,6 @@ class MyHTMLParser(HTMLParser):
# Need this so the regex in self.clean does not pick
# this up. It will also be cleaned in the same function.
self.write_to_file('Passage: ')
if self.tag_type == 'end':
self.write_to_file('\n\n')


@@ -1,30 +1,28 @@
import time
from typing import Any
from pathlib import Path
import scrapy
from scrapy.http import Response
from loguru import logger
# logger.remove(0)
logger.add('pages.txt')
logger.remove(0)
# logger.add('pages.txt')
class spider(scrapy.Spider):
name = 'matty'
base_folder = Path('./scraps/').absolute()
# allowed_domains = ['https://www.biblegateway.com/']
base_url = 'https://www.biblegateway.com/resources/matthew-henry'
start_urls = [f'{base_url}/toc']
custom_settings = {
"AUTOTHROTTLE_ENABLED": True,
# 'DOWNLOAD_DELAY': 2.5,
# "AUTOTHROTTLE_ENABLED": True,
# 'DOWNLOAD_DELAY': 1.5,
}
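# With AUTOTHROTTLE_ENABLED above, Scrapy's AutoThrottle extension adapts the
# download delay to server response times, so a fixed DOWNLOAD_DELAY is
# unnecessary while it is on.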
books = 0
chapters = 0
sections = 0
def parse(self, response: Response, **kwargs: Any) -> Any:
self.crawler.stats.set_value('books', 0)
self.crawler.stats.set_value('chapters', 0)
@@ -35,18 +33,52 @@ class spider(scrapy.Spider):
self.crawler.stats.inc_value('books')
yield response.follow(a, callback=self.parse_book)
def parse_book(self, response: Response, **kwargs: Any) -> Any:
# for a in response.copy().css('div.article a').getall():
# logger.info(a)
for a in response.css('ul.resource-list a').getall():
if 'Chapter' in a:
# book = response.url.split(r'/')[-1]
for a in response.css('ul.resource-list a'):
logger.debug(a)
# if 'Chapter' in a:
self.crawler.stats.inc_value('chapters')
# logger.info(f"Chapter: {a}")
# yield response.follow(a, callback=self.parse_chapter)
if 'Verse' in a:
self.crawler.stats.inc_value('sections')
if 'CHAPTER' in a.css('::text')[0].get().upper():
yield response.follow(a, callback=self.get_chapter)
else:
yield response.follow(a, callback=self.get_data)
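# Links whose text contains CHAPTER get a 'chapter_' filename prefix via
# get_chapter; every other resource link is saved as-is by get_data.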
# break
# if 'Verse' in a:
# self.crawler.stats.inc_value('sections')
# logger.info(f"Verse: {a}")
# yield response.follow(a, callback=self.parse_verses)
file_name = response.url.split('/')[-1].replace('.', '_') + '.html'
logger.warning(file_name)
data = response.css('div.article p').getall()
self.write_data(file_name, data)
def get_data(self, response: Response, **kwargs: Any) -> Any:
file_name = response.url.split('/')[-1].replace('.', '_') + '.html'
logger.warning(file_name)
data = response.css('div.article p').getall()
self.write_data(file_name, data)
def get_chapter(self, response: Response, **kwargs: Any) -> Any:
file_name = response.url.split('/')[-1].replace('.', '_') + '.html'
file_name = f'chapter_{file_name}'
logger.warning(file_name)
data = response.css('div.article p').getall()
self.write_data(file_name, data)
def write_data(self, filename, data):
file = self.base_folder / filename
file.parent.mkdir(exist_ok=True, parents=True)
with open(file, 'w+') as f:
for d in data:
f.write(d)
f.write('\n')
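# Mode 'w+' truncates an existing file, so re-running the spider simply
# overwrites any previously scraped copy of the page.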

v/vol_1.xml (101995 changed lines)

File diff suppressed because it is too large

v/vol_2.xml (126303 changed lines)

File diff suppressed because it is too large

v/vol_4.xml (167880 changed lines)

File diff suppressed because it is too large

v/vol_5.xml (142852 changed lines)

File diff suppressed because it is too large

v/vol_6.xml (131252 changed lines)

File diff suppressed because it is too large

vol_3.xml (132796 changed lines)

File diff suppressed because it is too large