commit 22e3584494
parent a5236b9dc0
This commit is contained in: current

main.py (154 lines changed)

--- a/main.py
+++ b/main.py
@@ -1,4 +1,3 @@
-import itertools
 import re
 import string
 import sys
@@ -10,86 +9,10 @@ from bs4 import BeautifulSoup
 from loguru import logger
 from tqdm import tqdm
 
-base = Path('./xml/Matthew Henry Commentary 2').absolute()
+base = Path('./xml/Matthew Henry Commentary').absolute()
 
 logger.remove(0)
-from functools import partialmethod
-
-logger.__class__.foobar = partialmethod(logger.__class__.log, "foobar", level='WARNING')
-logger.add('log.txt', level='WARNING')
-
-convert = dict([
-    ("Eccl", "Ecclesiastes"),
-    ("Rom", "Romans"),
-    ("2Tim", "2Timothy"),
-    ("2Pet", "2Peter"),
-    ("Gen", "Genesis"),
-    ("John", "John"),
-    ("1Cor", "1Corinthians"),
-    ("Ezek", "Ezekiel"),
-    ("Heb", "Hebrews"),
-    ("Rev", "Revelation"),
-    ("Eph", "Ephesians"),
-    ("Isa", "Isaiah"),
-    ("Ps", "Psalms"),
-    ("Acts", "Acts"),
-    ("Deut", "Deuteronomy"),
-    ("Neh", "Nehemiah"),
-    ("1Chr", "1Chronicles"),
-    ("2Kgs", "2Kings"),
-    ("2Cor", "2Corinthians"),
-    ("Hos", "Hosea"),
-    ("Exod", "Exodus"),
-    ("Matt", "Matthew"),
-    ("Job", "Job"),
-    ("Prov", "Proverbs"),
-    ("Col", "Colossians"),
-    ("Jer", "Jeremiah"),
-    ("1John", "1John"),
-    ("Jas", "James"),
-    ("1Tim", "1Timothy"),
-    ("Amos", "Amos"),
-    ("Mic", "Micah"),
-    ("Dan", "Daniel"),
-    ("Jonah", "Jonah"),
-    ("Luke", "Luke"),
-    ("Mal", "Malachi"),
-    ("Zech", "Zechariah"),
-    ("Phil", "Philippians"),
-    ("Mark", "Mark"),
-    ("2Thess", "2Thessalonians"),
-    ("Jude", "Jude"),
-    ("1Sam", "1Samuel"),
-    ("Num", "Numbers"),
-    ("Gal", "Galatians"),
-    ("1Pet", "1Peter"),
-    ("Lev", "Leviticus"),
-    ("2Chr", "2Chronicles"),
-    ("Lam", "Lamentations"),
-    ("1Kgs", "1Kings"),
-    ("Ezra", "Ezra"),
-    ("Obad", "Obadiah"),
-    ("Hab", "Habakkuk"),
-    ("2Sam", "2Samuel"),
-    ("Josh", "Joshua"),
-    ("Judg", "Judges"),
-    ("Ruth", "Ruth"),
-    ("Esth", "Esther"),
-    ("Phlm", "Philemon"),
-    ("1Thess", "1Thessalonians"),
-    ("Joel", "Joel"),
-    ("Titus", "Titus"),
-    ("3John", "3John"),
-    ("Zeph", "Zephaniah"),
-    ("Song", "Song of Songs"),
-    ("Hag", "Haggai"),
-    ("2John", "2John"),
-    ("Nah", "Nahum"),
-    ("Psa", "Psalms"),
-    ("2Peter", "2Peter")
-]
-)
-
+logger.add('log.txt')
 
 class MyHTMLParser(HTMLParser):
     file = None
@@ -229,17 +152,15 @@ class MyHTMLParser(HTMLParser):
 
             file.write(line)
 
-    def normalize_osis_verses(self, verses):
-        """Takes this
-        'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Col.1.16 Bible:Heb.1.2 Bible:Acts.17.24-Acts.17.25 Bible:Exod.5.26 Bible:Gen.5'
+    @staticmethod
+    def normalize_osis_verses(verses):
+        """Takes this 'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Col.1.16 Bible:Heb.1.2 Bible:Acts.17.24-Acts.17.25'
         and turns it into somthing more readable"""
         normalized_refs = list()
         for chunks in verses.removeprefix("Bible:").split('Bible:'):
             spanned_verses = list()
             for ref in chunks.split('-'):
                 verse = ref.replace('.', ' ', 1).replace('.', ':')
-                book = ref.split('.')[0]
-                verse.replace(book, convert[book])
                 spanned_verses.append(
                     bible.format_scripture_references(
                         bible.get_references(verse.strip())
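
Note: the rewritten normalize_osis_verses now relies entirely on the bible helpers rather than the removed convert table. A minimal sketch of the per-reference conversion, assuming the module is pythonbible imported as `bible` (the import itself is outside this diff):

    # Sketch only: assumes `import pythonbible as bible`, which this diff does not show.
    import pythonbible as bible

    ref = 'Acts.17.24'                                   # one half of an osisRef chunk
    verse = ref.replace('.', ' ', 1).replace('.', ':')   # 'Acts.17.24' -> 'Acts 17:24'
    refs = bible.get_references(verse.strip())           # parse into NormalizedReference objects
    print(bible.format_scripture_references(refs))       # -> 'Acts 17:24'
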
@@ -253,57 +174,16 @@ class MyHTMLParser(HTMLParser):
             except:
                 logger.warning(f"Error with: {verses=}: {spanned_verses=}")
                 raise
 
-            if normalized is False:
-                print(f"Error with: {verses=}: {spanned_verses=}")
-                sys.exit()
-
             normalized_refs.append(normalized)
 
         return ';'.join(normalized_refs)
 
-    # @staticmethod
-    def old_normalize_osis_verses(self, verses):
-        """Takes this
-        'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Acts.17.24-Acts.17.25 Bible:Exod.5.26 Bible:Gen.5'
-        and turns it into somthing more readable"""
-        normalized_refs = list()
-        for chunks in verses.removeprefix("Bible:").split('Bible:'):
-            spanned_verses = chunks.split('-')
-            if len(spanned_verses) < 2:
-                spanned_verses = list(itertools.repeat(spanned_verses[0], 2))
-
-            book, start_chapter, start_verse = spanned_verses[0].split('.')
-            _, end_chapter, end_verse = spanned_verses[1].split('.')
-            book = bible.Book[convert[book].upper()]
-
-            normalized = bible.NormalizedReference(
-                book=book,
-                start_chapter=start_chapter,
-                start_verse=start_verse,
-                end_chapter=end_chapter,
-                end_verse=end_verse,
-            )
-
-            if normalized is False:
-                logger.foobar(f'{self.args=}')
-
-            normalized_refs.append(
-                bible.format_scripture_references([normalized]).strip()
-            )
-
-        return ';'.join(normalized_refs)
-
     @staticmethod
-    @logger.catch(reraise=False)
     def obsidian_links(verses):
         """Take `Book Chapter:Ver-Ver and turn it into a obsidian bible link"""
         links = []
         for verse in verses.split(';'):
-            verse = verse.replace(':', '#').replace('-', '..')
-            if verse[0] in string.digits:
-                verse.replace(' ', '', 1)
-            links.append(f"[[{verse}]]")
-
+            links.append(f"[[{verse.replace(':', '#').replace('-', '..')}]]")
         return '; '.join(links)
 
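
Note: with the digit-handling branch dropped, obsidian_links is now a single substitution per reference. A small sketch of what the new loop emits (the input string is an assumed example, not taken from the repo):

    # Sketch of the new obsidian_links behaviour on a normalized reference string.
    verses = 'John 1:3;Acts 17:24-25'
    links = [f"[[{v.replace(':', '#').replace('-', '..')}]]" for v in verses.split(';')]
    print('; '.join(links))   # -> [[John 1#3]]; [[Acts 17#24..25]]
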
@@ -357,10 +237,9 @@ class MyHTMLParser(HTMLParser):
                 self.attrs['osisref']
             )
         else:
-            # verses = self.normalize_osis_verses(
-            #     self.convert_passage(self.attrs['passage'])
-            # )
-            raise ValueError('No `osisref` in tag attrs')
+            verses = self.normalize_osis_verses(
+                self.convert_passage(self.attrs['passage'])
+            )
 
         self.write_to_file(self.obsidian_links(verses))
 
@@ -377,22 +256,15 @@ class MyHTMLParser(HTMLParser):
             logger.debug(self.passage_verse)
             raise
 
-        if verse.start_verse == verse.end_verse:
-            self.write_to_file('## Verse: ')
-            self.write_to_file(
-                f'{verse.start_verse}'
-            )
-        else:
-            self.write_to_file('## Verses: ')
-            self.write_to_file(
-                f'{verse.start_verse} - {verse.end_verse}'
-            )
+        self.write_to_file('## Verses: ')
+        self.write_to_file(
+            f'{verse.start_verse} - {verse.end_verse}'
+        )
         self.write_to_file('\n\n')
 
         # Need this do the regex in self.clean does not pick
         # this up. Will be also cleaned in the same function.
         self.write_to_file('Passage: ')
 
         if self.tag_type == 'end':
             self.write_to_file('\n\n')
 
matty.py (68 lines changed)

--- a/matty.py
+++ b/matty.py
@@ -1,30 +1,28 @@
 import time
 from typing import Any
+from pathlib import Path
 
 import scrapy
 from scrapy.http import Response
 
 from loguru import logger
 
-# logger.remove(0)
-logger.add('pages.txt')
+logger.remove(0)
+# logger.add('pages.txt')
 
 
 class spider(scrapy.Spider):
     name = 'matty'
+    base_folder = Path('./scraps/').absolute()
+    # allowed_domains = ['https://www.biblegateway.com/']
     base_url = 'https://www.biblegateway.com/resources/matthew-henry'
     start_urls = [f'{base_url}/toc']
 
     custom_settings = {
-        "AUTOTHROTTLE_ENABLED": True,
-        # 'DOWNLOAD_DELAY': 2.5,
+        # "AUTOTHROTTLE_ENABLED": True,
+        # 'DOWNLOAD_DELAY': 1.5,
     }
 
-    books = 0
-    chapters = 0
-    sections = 0
-
 
     def parse(self, response: Response, **kwargs: Any) -> Any:
         self.crawler.stats.set_value('books', 0)
         self.crawler.stats.set_value('chapters', 0)
@@ -35,18 +33,52 @@ class spider(scrapy.Spider):
             self.crawler.stats.inc_value('books')
             yield response.follow(a, callback=self.parse_book)
 
 
     def parse_book(self, response: Response, **kwargs: Any) -> Any:
         # for a in response.copy().css('div.article a').getall():
         #     logger.info(a)
 
-        for a in response.css('ul.resource-list a').getall():
-            if 'Chapter' in a:
-                self.crawler.stats.inc_value('chapters')
-                # logger.info(f"Chapter: {a}")
-                # yield response.follow(a, callback=self.parse_chapter)
-
-            if 'Verse' in a:
-                self.crawler.stats.inc_value('sections')
-                # logger.info(f"Verse: {a}")
-                # yield response.follow(a, callback=self.parse_verses)
+        # book = response.url.split(r'/')[-1]
+
+        for a in response.css('ul.resource-list a'):
+            logger.debug(a)
+            # if 'Chapter' in a:
+            self.crawler.stats.inc_value('chapters')
+            # logger.info(f"Chapter: {a}")
+
+            if 'CHAPTER' in a.css('::text')[0].get().upper():
+                yield response.follow(a, callback=self.get_chapter)
+            else:
+                yield response.follow(a, callback=self.get_data)
+            # break
+            # if 'Verse' in a:
+            #     self.crawler.stats.inc_value('sections')
+            #     logger.info(f"Verse: {a}")
+            #     yield response.follow(a, callback=self.parse_verses)
+
+        file_name = response.url.split('/')[-1].replace('.', '_') + '.html'
+        logger.warning(file_name)
+        data = response.css('div.article p').getall()
+        self.write_data(file_name, data)
+
+    def get_data(self, response: Response, **kwargs: Any) -> Any:
+        file_name = response.url.split('/')[-1].replace('.', '_') + '.html'
+        logger.warning(file_name)
+        data = response.css('div.article p').getall()
+        self.write_data(file_name, data)
+
+    def get_chapter(self, response: Response, **kwargs: Any) -> Any:
+        file_name = response.url.split('/')[-1].replace('.', '_') + '.html'
+        file_name = f'chapter_{file_name}'
+        logger.warning(file_name)
+        data = response.css('div.article p').getall()
+        self.write_data(file_name, data)
+
+    def write_data(self, filename, data):
+        file = self.base_folder / filename
+        file.parent.mkdir(exist_ok=True, parents=True)
+
+        with open(file, 'w+') as f:
+            for d in data:
+                f.write(d)
+                f.write('\n')
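
Note: parse_book now iterates over Selector objects instead of raw strings so it can read each link's text before choosing between get_chapter and get_data. A rough sketch of that branch and of the file-name scheme used by get_data/get_chapter, with the HTML snippet and URL assumed purely for illustration:

    # Sketch only: the HTML and URL below are assumed, not taken from the site.
    from scrapy import Selector

    html = ('<ul class="resource-list">'
            '<li><a href="/resources/matthew-henry/Gen.1">Chapter 1</a></li>'
            '<li><a href="/resources/matthew-henry/Gen.1.1">Verses 1-5</a></li>'
            '</ul>')

    for a in Selector(text=html).css('ul.resource-list a'):
        text = a.css('::text')[0].get()
        if 'CHAPTER' in text.upper():
            print('chapter page:', a.attrib['href'])    # would be followed by get_chapter
        else:
            print('section page:', a.attrib['href'])    # would be followed by get_data

    # File-name scheme in get_data/get_chapter: last URL segment, dots to underscores.
    url = 'https://www.biblegateway.com/resources/matthew-henry/Gen.1.1'
    print(url.split('/')[-1].replace('.', '_') + '.html')   # Gen_1_1.html
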
v/vol_1.xml (101995 lines changed): file diff suppressed because it is too large
v/vol_2.xml (126303 lines changed): file diff suppressed because it is too large
v/vol_4.xml (167880 lines changed): file diff suppressed because it is too large
v/vol_5.xml (142852 lines changed): file diff suppressed because it is too large
v/vol_6.xml (131252 lines changed): file diff suppressed because it is too large