boose_magoose 2023-12-19 06:01:14 -05:00
parent a5236b9dc0
commit 22e3584494
8 changed files with 63 additions and 803237 deletions

main.py (146 changes)

@@ -1,4 +1,3 @@
-import itertools
 import re
 import string
 import sys
@@ -10,86 +9,10 @@ from bs4 import BeautifulSoup
 from loguru import logger
 from tqdm import tqdm
-base = Path('./xml/Matthew Henry Commentary 2').absolute()
+base = Path('./xml/Matthew Henry Commentary').absolute()
 logger.remove(0)
+logger.add('log.txt')
-from functools import partialmethod
-logger.__class__.foobar = partialmethod(logger.__class__.log, "foobar", level='WARNING')
-logger.add('log.txt', level='WARNING')
-convert = dict([
-    ("Eccl", "Ecclesiastes"),
-    ("Rom", "Romans"),
-    ("2Tim", "2Timothy"),
-    ("2Pet", "2Peter"),
-    ("Gen", "Genesis"),
-    ("John", "John"),
-    ("1Cor", "1Corinthians"),
-    ("Ezek", "Ezekiel"),
-    ("Heb", "Hebrews"),
-    ("Rev", "Revelation"),
-    ("Eph", "Ephesians"),
-    ("Isa", "Isaiah"),
-    ("Ps", "Psalms"),
-    ("Acts", "Acts"),
-    ("Deut", "Deuteronomy"),
-    ("Neh", "Nehemiah"),
-    ("1Chr", "1Chronicles"),
-    ("2Kgs", "2Kings"),
-    ("2Cor", "2Corinthians"),
-    ("Hos", "Hosea"),
-    ("Exod", "Exodus"),
-    ("Matt", "Matthew"),
-    ("Job", "Job"),
-    ("Prov", "Proverbs"),
-    ("Col", "Colossians"),
-    ("Jer", "Jeremiah"),
-    ("1John", "1John"),
-    ("Jas", "James"),
-    ("1Tim", "1Timothy"),
-    ("Amos", "Amos"),
-    ("Mic", "Micah"),
-    ("Dan", "Daniel"),
-    ("Jonah", "Jonah"),
-    ("Luke", "Luke"),
-    ("Mal", "Malachi"),
-    ("Zech", "Zechariah"),
-    ("Phil", "Philippians"),
-    ("Mark", "Mark"),
-    ("2Thess", "2Thessalonians"),
-    ("Jude", "Jude"),
-    ("1Sam", "1Samuel"),
-    ("Num", "Numbers"),
-    ("Gal", "Galatians"),
-    ("1Pet", "1Peter"),
-    ("Lev", "Leviticus"),
-    ("2Chr", "2Chronicles"),
-    ("Lam", "Lamentations"),
-    ("1Kgs", "1Kings"),
-    ("Ezra", "Ezra"),
-    ("Obad", "Obadiah"),
-    ("Hab", "Habakkuk"),
-    ("2Sam", "2Samuel"),
-    ("Josh", "Joshua"),
-    ("Judg", "Judges"),
-    ("Ruth", "Ruth"),
-    ("Esth", "Esther"),
-    ("Phlm", "Philemon"),
-    ("1Thess", "1Thessalonians"),
-    ("Joel", "Joel"),
-    ("Titus", "Titus"),
-    ("3John", "3John"),
-    ("Zeph", "Zephaniah"),
-    ("Song", "Song of Songs"),
-    ("Hag", "Haggai"),
-    ("2John", "2John"),
-    ("Nah", "Nahum"),
-    ("Psa", "Psalms"),
-    ("2Peter", "2Peter")
-]
-)
 class MyHTMLParser(HTMLParser):
     file = None
@@ -229,17 +152,15 @@ class MyHTMLParser(HTMLParser):
         file.write(line)
-    def normalize_osis_verses(self, verses):
-        """Takes this
-        'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Col.1.16 Bible:Heb.1.2 Bible:Acts.17.24-Acts.17.25 Bible:Exod.5.26 Bible:Gen.5'
-        and turns it into something more readable"""
+    @staticmethod
+    def normalize_osis_verses(verses):
+        """Takes this 'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Col.1.16 Bible:Heb.1.2 Bible:Acts.17.24-Acts.17.25'
+        and turns it into something more readable"""
         normalized_refs = list()
         for chunks in verses.removeprefix("Bible:").split('Bible:'):
             spanned_verses = list()
             for ref in chunks.split('-'):
                 verse = ref.replace('.', ' ', 1).replace('.', ':')
-                book = ref.split('.')[0]
-                verse.replace(book, convert[book])
                 spanned_verses.append(
                     bible.format_scripture_references(
                         bible.get_references(verse.strip())
@@ -253,57 +174,16 @@ class MyHTMLParser(HTMLParser):
             except:
                 logger.warning(f"Error with: {verses=}: {spanned_verses=}")
                 raise
-            if normalized is False:
-                print(f"Error with: {verses=}: {spanned_verses=}")
-                sys.exit()
             normalized_refs.append(normalized)
         return ';'.join(normalized_refs)
-    # @staticmethod
-    def old_normalize_osis_verses(self, verses):
-        """Takes this
-        'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Acts.17.24-Acts.17.25 Bible:Exod.5.26 Bible:Gen.5'
-        and turns it into something more readable"""
-        normalized_refs = list()
-        for chunks in verses.removeprefix("Bible:").split('Bible:'):
-            spanned_verses = chunks.split('-')
-            if len(spanned_verses) < 2:
-                spanned_verses = list(itertools.repeat(spanned_verses[0], 2))
-            book, start_chapter, start_verse = spanned_verses[0].split('.')
-            _, end_chapter, end_verse = spanned_verses[1].split('.')
-            book = bible.Book[convert[book].upper()]
-            normalized = bible.NormalizedReference(
-                book=book,
-                start_chapter=start_chapter,
-                start_verse=start_verse,
-                end_chapter=end_chapter,
-                end_verse=end_verse,
-            )
-            if normalized is False:
-                logger.foobar(f'{self.args=}')
-            normalized_refs.append(
-                bible.format_scripture_references([normalized]).strip()
-            )
-        return ';'.join(normalized_refs)
     @staticmethod
-    @logger.catch(reraise=False)
     def obsidian_links(verses):
         """Take `Book Chapter:Ver-Ver` and turn it into an obsidian bible link"""
         links = []
         for verse in verses.split(';'):
-            verse = verse.replace(':', '#').replace('-', '..')
-            if verse[0] in string.digits:
-                verse.replace(' ', '', 1)
-            links.append(f"[[{verse}]]")
+            links.append(f"[[{verse.replace(':', '#').replace('-', '..')}]]")
         return '; '.join(links)
@@ -357,10 +237,9 @@ class MyHTMLParser(HTMLParser):
                 self.attrs['osisref']
             )
         else:
-            # verses = self.normalize_osis_verses(
-            #     self.convert_passage(self.attrs['passage'])
-            # )
-            raise ValueError('No `osisref` in tag attrs')
+            verses = self.normalize_osis_verses(
+                self.convert_passage(self.attrs['passage'])
+            )
         self.write_to_file(self.obsidian_links(verses))
@@ -377,12 +256,6 @@ class MyHTMLParser(HTMLParser):
             logger.debug(self.passage_verse)
             raise
-        if verse.start_verse == verse.end_verse:
-            self.write_to_file('## Verse: ')
-            self.write_to_file(
-                f'{verse.start_verse}'
-            )
-        else:
-            self.write_to_file('## Verses: ')
-            self.write_to_file(
-                f'{verse.start_verse} - {verse.end_verse}'
-            )
+        self.write_to_file('## Verses: ')
+        self.write_to_file(
+            f'{verse.start_verse} - {verse.end_verse}'
+        )
@@ -392,7 +265,6 @@ class MyHTMLParser(HTMLParser):
         # Need this so the regex in self.clean does not pick
         # this up. Will also be cleaned in the same function.
         self.write_to_file('Passage: ')
-
         if self.tag_type == 'end':
             self.write_to_file('\n\n')
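For context on the refactor above: normalize_osis_verses now leans entirely on the `bible` library (its get_references / format_scripture_references / NormalizedReference API matches the pythonbible package) instead of the hand-rolled convert table, and obsidian_links then rewrites the formatted reference into a wiki link. A minimal sketch of the full round trip, assuming `bible` is pythonbible; osis_to_obsidian_link is an illustrative helper, not a function from this commit:

import pythonbible as bible

def osis_to_obsidian_link(ref: str) -> str:
    # 'Acts.17.24' -> 'Acts 17:24' (first '.' separates chapter, second the verse)
    verse = ref.replace('.', ' ', 1).replace('.', ':')
    # Round-trip through the library to get a canonically formatted reference.
    normalized = bible.format_scripture_references(bible.get_references(verse))
    # 'Acts 17:24' -> '[[Acts 17#24]]', mirroring obsidian_links above.
    return f"[[{normalized.replace(':', '#').replace('-', '..')}]]"

print(osis_to_obsidian_link('John.1.3'))  # [[John 1#3]]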

(scrapy spider module; filename not shown in this view)

@@ -1,30 +1,28 @@
 import time
 from typing import Any
+from pathlib import Path
 import scrapy
 from scrapy.http import Response
 from loguru import logger
-# logger.remove(0)
-logger.add('pages.txt')
+logger.remove(0)
+# logger.add('pages.txt')
 class spider(scrapy.Spider):
     name = 'matty'
+    base_folder = Path('./scraps/').absolute()
+    # allowed_domains = ['https://www.biblegateway.com/']
     base_url = 'https://www.biblegateway.com/resources/matthew-henry'
     start_urls = [f'{base_url}/toc']
     custom_settings = {
-        "AUTOTHROTTLE_ENABLED": True,
-        # 'DOWNLOAD_DELAY': 2.5,
+        # "AUTOTHROTTLE_ENABLED": True,
+        # 'DOWNLOAD_DELAY': 1.5,
     }
-    books = 0
-    chapters = 0
-    sections = 0
     def parse(self, response: Response, **kwargs: Any) -> Any:
         self.crawler.stats.set_value('books', 0)
         self.crawler.stats.set_value('chapters', 0)
@@ -35,18 +33,52 @@ class spider(scrapy.Spider):
             self.crawler.stats.inc_value('books')
             yield response.follow(a, callback=self.parse_book)
     def parse_book(self, response: Response, **kwargs: Any) -> Any:
         # for a in response.copy().css('div.article a').getall():
         #     logger.info(a)
-        for a in response.css('ul.resource-list a').getall():
-            if 'Chapter' in a:
-                self.crawler.stats.inc_value('chapters')
-                # logger.info(f"Chapter: {a}")
-                # yield response.follow(a, callback=self.parse_chapter)
-            if 'Verse' in a:
-                self.crawler.stats.inc_value('sections')
-                # logger.info(f"Verse: {a}")
-                # yield response.follow(a, callback=self.parse_verses)
+        # book = response.url.split(r'/')[-1]
+        for a in response.css('ul.resource-list a'):
+            logger.debug(a)
+            # if 'Chapter' in a:
+            self.crawler.stats.inc_value('chapters')
+            # logger.info(f"Chapter: {a}")
+            if 'CHAPTER' in a.css('::text')[0].get().upper():
+                yield response.follow(a, callback=self.get_chapter)
+            else:
+                yield response.follow(a, callback=self.get_data)
+            # break
+            # if 'Verse' in a:
+            #     self.crawler.stats.inc_value('sections')
+            # logger.info(f"Verse: {a}")
+            # yield response.follow(a, callback=self.parse_verses)
+        file_name = response.url.split('/')[-1].replace('.', '_') + '.html'
+        logger.warning(file_name)
+        data = response.css('div.article p').getall()
+        self.write_data(file_name, data)
+    def get_data(self, response: Response, **kwargs: Any) -> Any:
+        file_name = response.url.split('/')[-1].replace('.', '_') + '.html'
+        logger.warning(file_name)
+        data = response.css('div.article p').getall()
+        self.write_data(file_name, data)
+    def get_chapter(self, response: Response, **kwargs: Any) -> Any:
+        file_name = response.url.split('/')[-1].replace('.', '_') + '.html'
+        file_name = f'chapter_{file_name}'
+        logger.warning(file_name)
+        data = response.css('div.article p').getall()
+        self.write_data(file_name, data)
+    def write_data(self, filename, data):
+        file = self.base_folder / filename
+        file.parent.mkdir(exist_ok=True, parents=True)
+        with open(file, 'w+') as f:
+            for d in data:
+                f.write(d)
+                f.write('\n')
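The added get_data / get_chapter / write_data methods turn the spider into a page archiver: every article's <p> elements are written under ./scraps/. To smoke-test it outside a full Scrapy project, a runner along these lines should work (CrawlerProcess is standard Scrapy API; the module name is an assumption):

from scrapy.crawler import CrawlerProcess

from matty_spider import spider  # hypothetical module name for the file above

process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
process.crawl(spider)
process.start()  # blocks until the crawl finishes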

v/vol_1.xml (101995 changes)
File diff suppressed because it is too large

v/vol_2.xml (126303 changes)
File diff suppressed because it is too large

v/vol_4.xml (167880 changes)
File diff suppressed because it is too large

v/vol_5.xml (142852 changes)
File diff suppressed because it is too large

v/vol_6.xml (131252 changes)
File diff suppressed because it is too large

vol_3.xml (132796 changes)
File diff suppressed because it is too large
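The six suppressed vol_*.xml diffs are the raw commentary volumes being deleted, presumably the files the BeautifulSoup/HTMLParser pass in main.py was pointed at under ./xml/. The osisref attributes it consumes come from OSIS-style markup roughly like the snippet below (illustrative markup, not a quote from the deleted files):

from bs4 import BeautifulSoup

xml = '<reference osisRef="Bible:Acts.17.24-Acts.17.25">Acts 17:24-25</reference>'
soup = BeautifulSoup(xml, 'xml')  # the 'xml' parser requires lxml
for ref in soup.find_all('reference'):
    print(ref.get('osisRef'))  # Bible:Acts.17.24-Acts.17.25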