current
This commit is contained in:
parent
a5236b9dc0
commit
22e3584494
154
main.py
154
main.py
@ -1,4 +1,3 @@
|
||||
import itertools
|
||||
import re
|
||||
import string
|
||||
import sys
|
||||
@ -10,86 +9,10 @@ from bs4 import BeautifulSoup
|
||||
from loguru import logger
|
||||
from tqdm import tqdm
|
||||
|
||||
base = Path('./xml/Matthew Henry Commentary 2').absolute()
|
||||
base = Path('./xml/Matthew Henry Commentary').absolute()
|
||||
|
||||
logger.remove(0)
|
||||
|
||||
from functools import partialmethod
|
||||
|
||||
logger.__class__.foobar = partialmethod(logger.__class__.log, "foobar", level='WARNING')
|
||||
logger.add('log.txt', level='WARNING')
|
||||
|
||||
# Lookup table from the book abbreviation found in references such as
# 'Bible:John.1.3' to the spelled-out book name; indexed as `convert[book]`
# by the verse-normalizing code in this file.
# Both "Ps" and "Psa" map to "Psalms", and both "2Pet" and "2Peter" map to
# "2Peter", so either spelling of the abbreviation resolves.
convert = {
    "Eccl": "Ecclesiastes",
    "Rom": "Romans",
    "2Tim": "2Timothy",
    "2Pet": "2Peter",
    "Gen": "Genesis",
    "John": "John",
    "1Cor": "1Corinthians",
    "Ezek": "Ezekiel",
    "Heb": "Hebrews",
    "Rev": "Revelation",
    "Eph": "Ephesians",
    "Isa": "Isaiah",
    "Ps": "Psalms",
    "Acts": "Acts",
    "Deut": "Deuteronomy",
    "Neh": "Nehemiah",
    "1Chr": "1Chronicles",
    "2Kgs": "2Kings",
    "2Cor": "2Corinthians",
    "Hos": "Hosea",
    "Exod": "Exodus",
    "Matt": "Matthew",
    "Job": "Job",
    "Prov": "Proverbs",
    "Col": "Colossians",
    "Jer": "Jeremiah",
    "1John": "1John",
    "Jas": "James",
    "1Tim": "1Timothy",
    "Amos": "Amos",
    "Mic": "Micah",
    "Dan": "Daniel",
    "Jonah": "Jonah",
    "Luke": "Luke",
    "Mal": "Malachi",
    "Zech": "Zechariah",
    "Phil": "Philippians",
    "Mark": "Mark",
    "2Thess": "2Thessalonians",
    "Jude": "Jude",
    "1Sam": "1Samuel",
    "Num": "Numbers",
    "Gal": "Galatians",
    "1Pet": "1Peter",
    "Lev": "Leviticus",
    "2Chr": "2Chronicles",
    "Lam": "Lamentations",
    "1Kgs": "1Kings",
    "Ezra": "Ezra",
    "Obad": "Obadiah",
    "Hab": "Habakkuk",
    "2Sam": "2Samuel",
    "Josh": "Joshua",
    "Judg": "Judges",
    "Ruth": "Ruth",
    "Esth": "Esther",
    "Phlm": "Philemon",
    "1Thess": "1Thessalonians",
    "Joel": "Joel",
    "Titus": "Titus",
    "3John": "3John",
    "Zeph": "Zephaniah",
    "Song": "Song of Songs",
    "Hag": "Haggai",
    "2John": "2John",
    "Nah": "Nahum",
    "Psa": "Psalms",
    "2Peter": "2Peter",
}
|
||||
logger.add('log.txt')
|
||||
|
||||
class MyHTMLParser(HTMLParser):
|
||||
file = None
|
||||
@ -229,17 +152,15 @@ class MyHTMLParser(HTMLParser):
|
||||
|
||||
file.write(line)
|
||||
|
||||
def normalize_osis_verses(self, verses):
|
||||
"""Takes this
|
||||
'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Col.1.16 Bible:Heb.1.2 Bible:Acts.17.24-Acts.17.25 Bible:Exod.5.26 Bible:Gen.5'
|
||||
@staticmethod
|
||||
def normalize_osis_verses(verses):
|
||||
"""Takes this 'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Col.1.16 Bible:Heb.1.2 Bible:Acts.17.24-Acts.17.25'
|
||||
and turns it into something more readable"""
|
||||
normalized_refs = list()
|
||||
for chunks in verses.removeprefix("Bible:").split('Bible:'):
|
||||
spanned_verses = list()
|
||||
for ref in chunks.split('-'):
|
||||
verse = ref.replace('.', ' ', 1).replace('.', ':')
|
||||
book = ref.split('.')[0]
|
||||
verse.replace(book, convert[book])
|
||||
spanned_verses.append(
|
||||
bible.format_scripture_references(
|
||||
bible.get_references(verse.strip())
|
||||
@ -253,57 +174,16 @@ class MyHTMLParser(HTMLParser):
|
||||
except:
|
||||
logger.warning(f"Error with: {verses=}: {spanned_verses=}")
|
||||
raise
|
||||
|
||||
if normalized is False:
|
||||
print(f"Error with: {verses=}: {spanned_verses=}")
|
||||
sys.exit()
|
||||
|
||||
normalized_refs.append(normalized)
|
||||
|
||||
return ';'.join(normalized_refs)
|
||||
|
||||
# @staticmethod
|
||||
def old_normalize_osis_verses(self, verses):
    """Turn a run of 'Bible:'-prefixed references into readable text.

    Takes this
    'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Acts.17.24-Acts.17.25'
    and turns it into something more readable: every reference (or
    'start-end' span) is wrapped in a ``bible.NormalizedReference`` and
    rendered with ``bible.format_scripture_references``.

    Args:
        verses: String of references, each introduced by 'Bible:' and
            written as 'Book.Chapter.Verse', optionally followed by
            '-Book.Chapter.Verse' to mark the end of a span.

    Returns:
        The formatted references joined with ';'. Blank chunks are skipped,
        so an empty input yields an empty string.

    Raises:
        ValueError: If a reference does not have exactly the three
            'Book.Chapter.Verse' parts, or its chapter/verse is not numeric.
        KeyError: If the book abbreviation is missing from ``convert`` or
            the converted name is not a member of ``bible.Book``.
    """

    def split_ref(ref):
        # One 'Book.Chapter.Verse' token -> (abbreviation, chapter, verse).
        parts = ref.strip().split('.')
        if len(parts) != 3:
            raise ValueError(
                f"Expected 'Book.Chapter.Verse', got {ref!r} in {verses!r}"
            )
        abbreviation, chapter, verse = parts
        # NOTE(review): assumes NormalizedReference expects integer
        # chapter/verse numbers - confirm against the installed version.
        return abbreviation, int(chapter), int(verse)

    normalized_refs = []
    for chunk in verses.removeprefix("Bible:").split('Bible:'):
        # Splitting on 'Bible:' leaves the separating whitespace attached
        # to each chunk; remove it before parsing.
        chunk = chunk.strip()
        if not chunk:
            continue

        endpoints = chunk.split('-')
        start_ref = endpoints[0]
        # A single reference is treated as a span that starts and ends on
        # the same verse.
        end_ref = endpoints[1] if len(endpoints) > 1 else start_ref

        abbreviation, start_chapter, start_verse = split_ref(start_ref)
        _, end_chapter, end_verse = split_ref(end_ref)
        book = bible.Book[convert[abbreviation].upper()]

        normalized = bible.NormalizedReference(
            book=book,
            start_chapter=start_chapter,
            start_verse=start_verse,
            end_chapter=end_chapter,
            end_verse=end_verse,
        )

        normalized_refs.append(
            bible.format_scripture_references([normalized]).strip()
        )

    return ';'.join(normalized_refs)
|
||||
|
||||
@staticmethod
|
||||
@logger.catch(reraise=False)
|
||||
def obsidian_links(verses):
|
||||
"""Take `Book Chapter:Ver-Ver and turn it into a obsidian bible link"""
|
||||
links = []
|
||||
for verse in verses.split(';'):
|
||||
verse = verse.replace(':', '#').replace('-', '..')
|
||||
if verse[0] in string.digits:
|
||||
verse.replace(' ', '', 1)
|
||||
links.append(f"[[{verse}]]")
|
||||
links.append(f"[[{verse.replace(':', '#').replace('-', '..')}]]")
|
||||
|
||||
return '; '.join(links)
|
||||
|
||||
@ -357,10 +237,9 @@ class MyHTMLParser(HTMLParser):
|
||||
self.attrs['osisref']
|
||||
)
|
||||
else:
|
||||
# verses = self.normalize_osis_verses(
|
||||
# self.convert_passage(self.attrs['passage'])
|
||||
# )
|
||||
raise ValueError('No `osisref` in tag attrs')
|
||||
verses = self.normalize_osis_verses(
|
||||
self.convert_passage(self.attrs['passage'])
|
||||
)
|
||||
|
||||
self.write_to_file(self.obsidian_links(verses))
|
||||
|
||||
@ -377,22 +256,15 @@ class MyHTMLParser(HTMLParser):
|
||||
logger.debug(self.passage_verse)
|
||||
raise
|
||||
|
||||
if verse.start_verse == verse.end_verse:
|
||||
self.write_to_file('## Verse: ')
|
||||
self.write_to_file(
|
||||
f'{verse.start_verse}'
|
||||
)
|
||||
else:
|
||||
self.write_to_file('## Verses: ')
|
||||
self.write_to_file(
|
||||
f'{verse.start_verse} - {verse.end_verse}'
|
||||
)
|
||||
self.write_to_file('## Verses: ')
|
||||
self.write_to_file(
|
||||
f'{verse.start_verse} - {verse.end_verse}'
|
||||
)
|
||||
self.write_to_file('\n\n')
|
||||
|
||||
# Need this so the regex in self.clean does not pick
|
||||
# this up. Will be also cleaned in the same function.
|
||||
self.write_to_file('Passage: ')
|
||||
|
||||
if self.tag_type == 'end':
|
||||
self.write_to_file('\n\n')
|
||||
|
||||
|
68
matty.py
68
matty.py
@ -1,30 +1,28 @@
|
||||
import time
|
||||
from typing import Any
|
||||
from pathlib import Path
|
||||
|
||||
import scrapy
|
||||
from scrapy.http import Response
|
||||
|
||||
from loguru import logger
|
||||
|
||||
# logger.remove(0)
|
||||
logger.add('pages.txt')
|
||||
logger.remove(0)
|
||||
# logger.add('pages.txt')
|
||||
|
||||
|
||||
class spider(scrapy.Spider):
|
||||
name = 'matty'
|
||||
base_folder = Path('./scraps/').absolute()
|
||||
# allowed_domains = ['https://www.biblegateway.com/']
|
||||
base_url = 'https://www.biblegateway.com/resources/matthew-henry'
|
||||
start_urls = [f'{base_url}/toc']
|
||||
|
||||
custom_settings = {
|
||||
"AUTOTHROTTLE_ENABLED": True,
|
||||
# 'DOWNLOAD_DELAY': 2.5,
|
||||
# "AUTOTHROTTLE_ENABLED": True,
|
||||
# 'DOWNLOAD_DELAY': 1.5,
|
||||
}
|
||||
|
||||
books = 0
|
||||
chapters = 0
|
||||
sections = 0
|
||||
|
||||
|
||||
def parse(self, response: Response, **kwargs: Any) -> Any:
|
||||
self.crawler.stats.set_value('books', 0)
|
||||
self.crawler.stats.set_value('chapters', 0)
|
||||
@ -35,18 +33,52 @@ class spider(scrapy.Spider):
|
||||
self.crawler.stats.inc_value('books')
|
||||
yield response.follow(a, callback=self.parse_book)
|
||||
|
||||
|
||||
def parse_book(self, response: Response, **kwargs: Any) -> Any:
|
||||
# for a in response.copy().css('div.article a').getall():
|
||||
# logger.info(a)
|
||||
|
||||
for a in response.css('ul.resource-list a').getall():
|
||||
if 'Chapter' in a:
|
||||
self.crawler.stats.inc_value('chapters')
|
||||
# logger.info(f"Chapter: {a}")
|
||||
# yield response.follow(a, callback=self.parse_chapter)
|
||||
# book = response.url.split(r'/')[-1]
|
||||
|
||||
if 'Verse' in a:
|
||||
self.crawler.stats.inc_value('sections')
|
||||
# logger.info(f"Verse: {a}")
|
||||
# yield response.follow(a, callback=self.parse_verses)
|
||||
for a in response.css('ul.resource-list a'):
|
||||
logger.debug(a)
|
||||
# if 'Chapter' in a:
|
||||
self.crawler.stats.inc_value('chapters')
|
||||
# logger.info(f"Chapter: {a}")
|
||||
|
||||
if 'CHAPTER' in a.css('::text')[0].get().upper():
|
||||
yield response.follow(a, callback=self.get_chapter)
|
||||
else:
|
||||
yield response.follow(a, callback=self.get_data)
|
||||
# break
|
||||
# if 'Verse' in a:
|
||||
# self.crawler.stats.inc_value('sections')
|
||||
# logger.info(f"Verse: {a}")
|
||||
# yield response.follow(a, callback=self.parse_verses)
|
||||
|
||||
file_name = response.url.split('/')[-1].replace('.', '_') + '.html'
|
||||
logger.warning(file_name)
|
||||
data = response.css('div.article p').getall()
|
||||
self.write_data(file_name, data)
|
||||
|
||||
def get_data(self, response: Response, **kwargs: Any) -> Any:
    """Store the article paragraphs of ``response`` in an ``.html`` file.

    The file name is the last path segment of the response URL with every
    '.' replaced by '_' and '.html' appended. The name is logged at
    warning level, then the markup of every ``div.article p`` match is
    handed to ``self.write_data``.
    """
    last_segment = response.url.split('/')[-1]
    file_name = f"{last_segment.replace('.', '_')}.html"
    logger.warning(file_name)
    paragraphs = response.css('div.article p').getall()
    self.write_data(file_name, paragraphs)
|
||||
|
||||
def get_chapter(self, response: Response, **kwargs: Any) -> Any:
    """Store the article paragraphs of ``response`` in a ``chapter_`` file.

    The file name is 'chapter_' followed by the last path segment of the
    response URL (every '.' replaced by '_') and '.html'. The name is
    logged at warning level, then the markup of every ``div.article p``
    match is handed to ``self.write_data``.
    """
    page_slug = response.url.split('/')[-1].replace('.', '_')
    file_name = 'chapter_' + page_slug + '.html'
    logger.warning(file_name)
    paragraphs = response.css('div.article p').getall()
    self.write_data(file_name, paragraphs)
|
||||
|
||||
def write_data(self, filename, data):
    """Write ``data`` to ``self.base_folder / filename``, one item per line.

    Args:
        filename: Path relative to ``self.base_folder``; missing parent
            directories are created.
        data: Iterable of strings. Each item is written followed by a
            newline. Any existing file content is replaced.
    """
    target = self.base_folder / filename
    target.parent.mkdir(exist_ok=True, parents=True)

    # Pin the encoding: the platform default is not UTF-8 everywhere, and
    # text outside that default would fail to encode or be written
    # inconsistently between machines.
    with open(target, 'w', encoding='utf-8') as f:
        f.writelines(item + '\n' for item in data)
|
||||
|
101995
v/vol_1.xml
101995
v/vol_1.xml
File diff suppressed because it is too large
Load Diff
126303
v/vol_2.xml
126303
v/vol_2.xml
File diff suppressed because it is too large
Load Diff
167880
v/vol_4.xml
167880
v/vol_4.xml
File diff suppressed because it is too large
Load Diff
142852
v/vol_5.xml
142852
v/vol_5.xml
File diff suppressed because it is too large
Load Diff
131252
v/vol_6.xml
131252
v/vol_6.xml
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user