current
This commit is contained in:
parent
22e3584494
commit
8f46274857
572
main_1.py
Normal file
572
main_1.py
Normal file
@ -0,0 +1,572 @@
|
||||
import collections
|
||||
import itertools
|
||||
import re
|
||||
import string
|
||||
import sys
|
||||
from html.parser import HTMLParser
|
||||
from pathlib import Path
|
||||
|
||||
import pythonbible as bible
|
||||
from bs4 import BeautifulSoup
|
||||
from loguru import logger
|
||||
from tqdm import tqdm
|
||||
|
||||
# Root directory handed to create_file() by the script section at the bottom
# of this file; the consolidated per-chapter files are created beneath it.
base = Path('./scraps_consolidated').absolute()

# logger.remove(0)

from functools import partialmethod

# Attach a ``foobar`` method to the logger's class.  Calling it forwards to
# ``log`` with "foobar" pre-bound as the first positional argument and
# ``level='WARNING'`` pre-bound as a keyword argument.
# NOTE(review): no ``logger.level("foobar", ...)`` registration is visible in
# this file -- confirm that "foobar" is a level name loguru will accept.
logger.__class__.foobar = partialmethod(logger.__class__.log, "foobar", level='WARNING')
# Additional sink: log.txt, registered with level='WARNING'.
logger.add('log.txt', level='WARNING')
|
||||
|
||||
# Book name (as used for output folders) -> abbreviation used in the scrap
# file names, e.g. "Genesis" -> "Gen".
shorts = {
    "Nahum": "Nah",
    "Colossians": "Col",
    "Esther": "Esth",
    "James": "Jas",
    "John": "John",
    "3John": "3John",
    "Revelation": "Rev",
    "Titus": "Titus",
    "Zechariah": "Zech",
    "Malachi": "Mal",
    "Lamentations": "Lam",
    "Matthew": "Matt",
    "Jonah": "Jonah",
    "Hosea": "Hos",
    "Joel": "Joel",
    "Genesis": "Gen",
    "Proverbs": "Prov",
    "Romans": "Rom",
    "Micah": "Mic",
    "Jeremiah": "Jer",
    "Ecclesiastes": "Eccl",
    "Philippians": "Phil",
    "Galatians": "Gal",
    "Acts": "Acts",
    "Psalms": "Ps",
    "Isaiah": "Isa",
    "2Chronicles": "2Chr",
    "Job": "Job",
    "Deuteronomy": "Deut",
    "Luke": "Luke",
    "Numbers": "Num",
    "Daniel": "Dan",
    "Hebrews": "Heb",
    "Nehemiah": "Neh",
    "1John": "1John",
    "Haggai": "Hag",
    "1Kings": "1Kgs",
    "Joshua": "Josh",
    "1Corinthians": "1Cor",
    "Mark": "Mark",
    "2Kings": "2Kgs",
    "Obadiah": "Obad",
    "Ephesians": "Eph",
    "1Timothy": "1Tim",
    "Philemon": "Phlm",
    "1Chronicles": "1Chr",
    "2Samuel": "2Sam",
    "1Samuel": "1Sam",
    "2Peter": "2Pet",
    "2John": "2John",
    "Song of Solomon": "Song",
    "Jude": "Jude",
    "Amos": "Amos",
    "1Peter": "1Pet",
    "Leviticus": "Lev",
    "Habakkuk": "Hab",
    "Ruth": "Ruth",
    "1Thessalonians": "1Thess",
    "Ezra": "Ezra",
    "Ezekiel": "Ezek",
    "2Thessalonians": "2Thess",
    "2Timothy": "2Tim",
    "Exodus": "Exod",
    "2Corinthians": "2Cor",
    "Judges": "Judg",
    "Zephaniah": "Zeph",
}
|
||||
|
||||
# Per-book override for the end verse of the chapter reference that the
# script section builds when it looks up a chapter-introduction scrap.
end_verses = {
    'Obadiah': 21,
    'Philemon': 25,
    '2John': 13,
    '3John': 15,
    'Jude': 25,
}
|
||||
|
||||
class MyHTMLParser(HTMLParser):
|
||||
file = None
|
||||
passage_verse = None
|
||||
tag = None
|
||||
tag_type = None # 'start' or 'end'
|
||||
skip_re = False
|
||||
attrs = dict()
|
||||
|
||||
# All the patterns are for re.sub(). This is specifically for getting
|
||||
# list/sublist into the correct position. The first item is for the pattern
|
||||
# to find and the second is the replacement pattern. The replacement pattern
|
||||
# also ids the list so that when the self.clean_file() we can properly add
|
||||
# the correct amount of `\t` to the line.
|
||||
patterns = [
|
||||
(r"\[(\d+\.)\]", lambda x: f'\n{x.group(1)}~4'),
|
||||
(r"\((\d+\.)\)", lambda x: f'\n{x.group(1)}~3'),
|
||||
|
||||
# These share the same id. There are times were the two are a mixed
|
||||
# within the parent lists.
|
||||
(r"^\b(\d+\.)", lambda x: f'\n{x.group(1)}~2'),
|
||||
(r"(\S\s)(\d+\.)(\s\b|\W)",
|
||||
lambda x: f'{x.group(1)}\n{x.group(2)}~2{x.group(3)}'),
|
||||
|
||||
|
||||
(r"(?=[MDCLXVI])M*(C[MD]|D?C{0,3})(X[CL]|L?X{0,3})(I[XV]|V?I{0,3})\.",
|
||||
lambda x: f'\n{parser.roman_to_int(x.group())}.~1'),
|
||||
]
|
||||
|
||||
@staticmethod
|
||||
def roman_to_int(number: str) -> int:
|
||||
if number is None:
|
||||
return 0
|
||||
|
||||
# Sometimes the roman numeral comes with a '.'. Striping to not cause
|
||||
# issues
|
||||
number = number.strip('.')
|
||||
|
||||
roman = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
|
||||
total = 0
|
||||
for i in range(len(number) - 1, -1, -1):
|
||||
num = roman[number[i]]
|
||||
if 3 * num < total:
|
||||
total -= num
|
||||
else:
|
||||
total += num
|
||||
|
||||
return total
|
||||
|
||||
def create_md_file(self, base_dir: Path, folder: str, file: str):
|
||||
""" Create the path of the md file that will be written to as the
|
||||
program process the xml file.
|
||||
"""
|
||||
|
||||
md_file = Path(f'{base_dir}/{folder}/{file}.md').absolute()
|
||||
md_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
md_file.unlink(missing_ok=True)
|
||||
# print(md_file.absolute())
|
||||
self.file = md_file
|
||||
return md_file
|
||||
|
||||
def write_to_file(self, data, skip_re=False):
|
||||
|
||||
# if self.file is None:
|
||||
# raise ValueError('No File specified')
|
||||
|
||||
if not skip_re:
|
||||
# Here the list/sublist are searched for and altered for later
|
||||
# processing in self.clean_file()
|
||||
for pattern, sub in self.patterns:
|
||||
# logger.debug(f'Running Pattern: {pattern} on {data[:50]}')
|
||||
data = re.sub(pattern, sub, data)
|
||||
|
||||
# print(f'writing: {data!r}')
|
||||
|
||||
if self.file:
|
||||
# raise ValueError('No File specified')
|
||||
with open(self.file, 'a+') as file:
|
||||
file.write(data)
|
||||
else:
|
||||
print(data)
|
||||
|
||||
def clean_file(self):
|
||||
logger.info(' CLEANING FILE')
|
||||
|
||||
if self.file is None:
|
||||
raise ValueError('No File specified')
|
||||
|
||||
# Read all the lines
|
||||
with open(self.file, 'r') as file:
|
||||
lines = file.readlines()
|
||||
|
||||
indent = 0 # Current number of '\t' to add in front of the list number
|
||||
sublist = dict() # Track the indent number based on the id.
|
||||
with open(self.file, 'w') as file:
|
||||
file.write('# Chapter Introduction\n\n')
|
||||
|
||||
header = None
|
||||
for line in lines:
|
||||
# Do not write blank lines
|
||||
if not line.strip():
|
||||
continue
|
||||
|
||||
# Main header, if a header has already been written skip it
|
||||
if line.strip().startswith('# '):
|
||||
if header == line.strip():
|
||||
continue
|
||||
header = line.strip()
|
||||
|
||||
# Add a `\n` if line is a heading.
|
||||
if line.startswith('#'):
|
||||
line = f'\n{line.strip()}'
|
||||
|
||||
# If line is not a list then add a `\n` after the paragraph,
|
||||
# and reset the indent tracker
|
||||
if line.strip()[0] not in string.digits:
|
||||
line = f'{line}\n'
|
||||
# print('resetting indent')
|
||||
indent = 0
|
||||
sublist.clear()
|
||||
else:
|
||||
# Get the list id
|
||||
start = line.find('~')
|
||||
end = start + 2
|
||||
list_id = line[start:end]
|
||||
|
||||
# Get the number of indents based on the list id, if it does
|
||||
# not exist they set the value to the max value +1. If this
|
||||
# is the first it will be set to 0 (-1 +1 = 0)
|
||||
indents = "\t" * sublist.setdefault(
|
||||
list_id, max(sublist.values(), default=-1)+1
|
||||
)
|
||||
line = f'{indents}{line.replace(list_id, "")}'
|
||||
|
||||
if line.startswith('Passage: '):
|
||||
line = line.removeprefix('Passage: ')
|
||||
|
||||
file.write(line)
|
||||
|
||||
def normalize_osis_verses(self, verses):
|
||||
"""Takes this
|
||||
'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Col.1.16 Bible:Heb.1.2 Bible:Acts.17.24-Acts.17.25 Bible:Exod.5.26 Bible:Gen.5'
|
||||
and turns it into somthing more readable"""
|
||||
normalized_refs = list()
|
||||
for chunks in verses.removeprefix("Bible:").split('Bible:'):
|
||||
spanned_verses = list()
|
||||
for ref in chunks.split('-'):
|
||||
verse = ref.replace('.', ' ', 1).replace('.', ':')
|
||||
book = ref.split('.')[0]
|
||||
verse.replace(book, convert[book])
|
||||
spanned_verses.append(
|
||||
bible.format_scripture_references(
|
||||
bible.get_references(verse.strip())
|
||||
)
|
||||
)
|
||||
|
||||
try:
|
||||
normalized = bible.format_scripture_references(
|
||||
bible.get_references('-'.join(spanned_verses))
|
||||
)
|
||||
except:
|
||||
logger.warning(f"Error with: {verses=}: {spanned_verses=}")
|
||||
raise
|
||||
|
||||
if normalized is False:
|
||||
print(f"Error with: {verses=}: {spanned_verses=}")
|
||||
sys.exit()
|
||||
|
||||
normalized_refs.append(normalized)
|
||||
|
||||
return ';'.join(normalized_refs)
|
||||
|
||||
# @staticmethod
|
||||
def old_normalize_osis_verses(self, verses):
|
||||
"""Takes this
|
||||
'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Acts.17.24-Acts.17.25 Bible:Exod.5.26 Bible:Gen.5'
|
||||
and turns it into somthing more readable"""
|
||||
normalized_refs = list()
|
||||
for chunks in verses.removeprefix("Bible:").split('Bible:'):
|
||||
spanned_verses = chunks.split('-')
|
||||
if len(spanned_verses) < 2:
|
||||
spanned_verses = list(itertools.repeat(spanned_verses[0], 2))
|
||||
|
||||
book, start_chapter, start_verse = spanned_verses[0].split('.')
|
||||
_, end_chapter, end_verse = spanned_verses[1].split('.')
|
||||
book = bible.Book[convert[book].upper()]
|
||||
|
||||
normalized = bible.NormalizedReference(
|
||||
book=book,
|
||||
start_chapter=start_chapter,
|
||||
start_verse=start_verse,
|
||||
end_chapter=end_chapter,
|
||||
end_verse=end_verse,
|
||||
)
|
||||
|
||||
if normalized is False:
|
||||
logger.foobar(f'{self.args=}')
|
||||
|
||||
normalized_refs.append(
|
||||
bible.format_scripture_references([normalized]).strip()
|
||||
)
|
||||
|
||||
return ';'.join(normalized_refs)
|
||||
|
||||
@staticmethod
|
||||
@logger.catch(reraise=False)
|
||||
def obsidian_links(verses):
|
||||
"""Take `Book Chapter:Ver-Ver and turn it into a obsidian bible link"""
|
||||
links = []
|
||||
for verse in verses.split(';'):
|
||||
verse = verse.replace(':', '#').replace('-', '..')
|
||||
if verse[0] in string.digits:
|
||||
verse.replace(' ', '', 1)
|
||||
links.append(f"[[{verse}]]")
|
||||
|
||||
return '; '.join(links)
|
||||
|
||||
def convert_passage(self, verses):
|
||||
book = self.file.parts[-2]
|
||||
chapter = None
|
||||
ref = []
|
||||
for ver in verses.split(','):
|
||||
try:
|
||||
chapter, v = ver.split(':')
|
||||
except:
|
||||
v = ver
|
||||
|
||||
r = []
|
||||
for i in v.split('-'):
|
||||
r.append(f'{book}.{chapter}.{i}')
|
||||
|
||||
ref.append(f'Bible:{"-".join(r)}')
|
||||
x = ' '.join(ref)
|
||||
return f'osisRef="{x}"'
|
||||
|
||||
def process_tag(self, test=None):
|
||||
match self.tag:
|
||||
case 'b': # Bold
|
||||
self.write_to_file('**')
|
||||
|
||||
case 'i': # Italics
|
||||
self.write_to_file('*')
|
||||
|
||||
case 'scripcom':
|
||||
# This saves the verse for the section.
|
||||
if self.tag_type == 'start':
|
||||
if self.attrs['type'] == 'Commentary':
|
||||
self.passage_verse = self.attrs['osisref']
|
||||
|
||||
case 'h4': # this is the section header. Need to keep track of it per file.
|
||||
if self.tag_type == 'start':
|
||||
self.write_to_file('\n# ',)
|
||||
self.skip_re = True
|
||||
|
||||
if self.tag_type == 'end':
|
||||
self.write_to_file('\n')
|
||||
self.skip_re = False
|
||||
|
||||
case 'scripref': # Scripture ref
|
||||
# get attr 'osisref' and parse [..., ('passage', 'Bible:Rev.14.6-Rev.14.7')]
|
||||
# bible.format_scripture_references(bible.get_references(attrs['osisref']))
|
||||
if self.tag_type == 'start':
|
||||
if 'osisref' in self.attrs:
|
||||
verses = self.normalize_osis_verses(
|
||||
self.attrs['osisref']
|
||||
)
|
||||
else:
|
||||
# verses = self.normalize_osis_verses(
|
||||
# self.convert_passage(self.attrs['passage'])
|
||||
# )
|
||||
raise ValueError('No `osisref` in tag attrs')
|
||||
|
||||
self.write_to_file(self.obsidian_links(verses))
|
||||
|
||||
case 'p': # Paragraph
|
||||
if self.tag_type == 'start':
|
||||
if self.attrs.get('class', False) == 'passage':
|
||||
if self.passage_verse:
|
||||
try:
|
||||
verse = bible.get_references(
|
||||
self.normalize_osis_verses(
|
||||
self.passage_verse)
|
||||
)[0]
|
||||
except:
|
||||
logger.debug(self.passage_verse)
|
||||
raise
|
||||
|
||||
if verse.start_verse == verse.end_verse:
|
||||
self.write_to_file('## Verse: ')
|
||||
self.write_to_file(
|
||||
f'{verse.start_verse}'
|
||||
)
|
||||
else:
|
||||
self.write_to_file('## Verses: ')
|
||||
self.write_to_file(
|
||||
f'{verse.start_verse} - {verse.end_verse}'
|
||||
)
|
||||
self.write_to_file('\n\n')
|
||||
|
||||
# Need this do the regex in self.clean does not pick
|
||||
# this up. Will be also cleaned in the same function.
|
||||
self.write_to_file('Passage: ')
|
||||
|
||||
if self.tag_type == 'end':
|
||||
self.write_to_file('\n\n')
|
||||
|
||||
case 'li':
|
||||
if self.tag_type == 'start':
|
||||
self.write_to_file('- ')
|
||||
else:
|
||||
self.write_to_file('\n')
|
||||
|
||||
# case _:
|
||||
# # print(tag.upper())
|
||||
# pass
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
# Set taf trackers
|
||||
self.tag_type = 'start'
|
||||
self.tag = tag
|
||||
self.attrs = dict(attrs)
|
||||
|
||||
self.process_tag()
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
# print(f'End: {tag}')
|
||||
self.tag = tag
|
||||
self.tag_type = 'end'
|
||||
self.process_tag()
|
||||
|
||||
# Clear tag tracker
|
||||
self.tag = None
|
||||
self.tag_type = None
|
||||
self.attrs.clear()
|
||||
self.skip_re = False
|
||||
|
||||
def handle_data(self, data):
|
||||
# print(f'Data: {data!r}')
|
||||
|
||||
if self.tag in ['scripref', 'h2', 'h3', 'h5']:
|
||||
return
|
||||
|
||||
if data == '\n':
|
||||
return
|
||||
|
||||
data = data.replace('—', '-- ')
|
||||
data = data.replace('GOD', 'God')
|
||||
data = data.replace('\n', ' ')
|
||||
|
||||
self.write_to_file(data, self.skip_re)
|
||||
|
||||
|
||||
def create_file(base_dir: Path, folder: str, file: str, file_type: str) -> Path:
    """Return a fresh absolute path ``<base_dir>/<folder>/<file>.<file_type>``.

    The parent directory is created when it is missing, and a file that
    already exists at the path is deleted.  The file itself is not created.
    """
    relative = f'{base_dir}/{folder}/{file}.{file_type}'
    target = Path(relative).absolute()

    directory = target.parent
    directory.mkdir(parents=True, exist_ok=True)

    # Start from a clean slate: later writes to this path append.
    target.unlink(missing_ok=True)
    return target
|
||||
|
||||
|
||||
def write_file(file, data):
    """Append *data* to *file*, followed by a single newline.

    *data* may be a string or an iterable of strings; the items of an
    iterable are written back to back without separators.  The file is
    created when it does not exist yet.
    """
    with open(file, 'a') as f:
        if isinstance(data, str):
            # One write for the whole string instead of one per character.
            f.write(data)
        else:
            f.writelines(data)
        f.write('\n')
|
||||
|
||||
if __name__ == '__main__':
    # Consolidate the scraped HTML fragments in ./scraps into one HTML file
    # per chapter under `base`, following the book (div1) / chapter (div2)
    # structure of every volume XML file in the current directory.
    parser = MyHTMLParser()

    # Process each volume file.
    scraps = Path('./scraps')
    files_used = collections.Counter()  # scrap file name -> times consumed
    for vol in sorted(Path('./').glob('*.xml')):
        print(vol)
        soup = BeautifulSoup(vol.read_text(), 'xml')
        for book in soup.find_all('div1'):
            book_name = (
                book['title']
                .replace('First ', '1')
                .replace('Second ', '2')
                .replace('Third ', '3')
            )

            # Only titles that are keys of `shorts` are processed.
            if book_name not in shorts:
                print(f'skipping {book_name}')
                continue

            print('\t', book_name, book['id'])
            for chapter in book.find_all('div2'):
                filename = chapter['title']

                if 'Index' in filename:
                    print('skipping', filename)
                    continue

                # Reset for every chapter so that a title without a chapter
                # numeral can never silently reuse the previous number.
                chapter_num = None

                # Remove Roman Numerals from file name
                if 'CHAPTER' in filename.upper():
                    _, roman_num = chapter['title'].split(' ')
                    chapter_num = parser.roman_to_int(roman_num)
                    filename = f'Chapter {chapter_num}'

                print('\t\t', filename, chapter['id'])

                file = create_file(base, book_name, filename, 'html')
                file.touch()

                # book intro file
                if 'INTRODUCTION' in file.name.upper():
                    book_intro = scraps / f"{book_name}.html"
                    if not book_intro.exists():
                        print(book_intro.name, book_intro.exists())
                    files_used.update([book_intro.name])
                    file.write_bytes(book_intro.read_bytes())
                    continue

                if chapter_num is None:
                    raise ValueError(
                        f'No chapter number in title {chapter["title"]!r} '
                        f'of {book_name}'
                    )

                # Chapter intro
                book_short = shorts[book_name]

                ci = bible.get_references(f"{book_name} {chapter_num}")[0]
                # Books listed in `end_verses` get their end verse overridden.
                if ev := end_verses.get(book_name, False):
                    ci.end_verse = ev

                chapter_into = scraps / (
                    f"{book_short}_{chapter_num}_{ci.start_verse}-"
                    f"{book_short}_{chapter_num}_{ci.end_verse}.html"
                )

                if not chapter_into.exists():
                    print(chapter_into.name, chapter_into.exists())
                    raise ValueError(
                        f'Missing chapter intro scrap: {chapter_into}'
                    )
                files_used.update([chapter_into.name])

                # write(file, '<h1>Chapter Introduction</h1>')
                write_file(file, '<!-- Chapter Intro -->')
                write_file(file, chapter_into.read_text())

                # Chapter sections: replace every passage paragraph by a
                # plain <p> that keeps its attributes but only its text.
                for passage in chapter.find_all('p', {'class': 'passage'}):
                    new_t = soup.new_tag('p', attrs=passage.attrs)
                    new_t.string = passage.text
                    passage.replace_with(new_t)

                write_file(file, '<!-- Chapter Sections -->')

                for section in chapter.find_all('div', {'class': 'Commentary'}):
                    if header := section.find('h4'):
                        write_file(file, str(header))

                    # Nearest preceding element that is not just whitespace;
                    # its osisRef names the scrap file for this section.
                    sc = next(
                        (el for el in section.previous_elements
                         if str(el).strip()),
                        None,
                    )
                    if sc is None:
                        raise ValueError(
                            'Commentary section has no preceding element'
                        )

                    verses_id = str(sc['osisRef'])
                    verses = verses_id.replace('Bible:', '').replace('.', '_')

                    verse_section = scraps / f"{verses}.html"
                    print('\t\t', verse_section.name, verse_section.exists())
                    if not verse_section.exists():
                        raise ValueError(f'osisRef="{verses_id}"')

                    files_used.update([verse_section.name])
|
||||
|
||||
|
||||
|
||||
|
579
main_2.py
Normal file
579
main_2.py
Normal file
@ -0,0 +1,579 @@
|
||||
import collections
|
||||
import itertools
|
||||
import re
|
||||
import string
|
||||
import sys
|
||||
from html.parser import HTMLParser
|
||||
from pathlib import Path
|
||||
|
||||
import pythonbible as bible
|
||||
from bs4 import BeautifulSoup
|
||||
from loguru import logger
|
||||
from tqdm import tqdm
|
||||
import difflib
|
||||
|
||||
# Root directory handed to create_file() by the script section at the bottom
# of this file; the consolidated per-chapter files are created beneath it.
base = Path('./scraps_consolidated').absolute()

# Remove the handler with id 0, then add mixing.log as a sink.
logger.remove(0)
logger.add('mixing.log')
from functools import partialmethod

# Attach a ``foobar`` method to the logger's class.  Calling it forwards to
# ``log`` with "foobar" pre-bound as the first positional argument and
# ``level='WARNING'`` pre-bound as a keyword argument.
# NOTE(review): no ``logger.level("foobar", ...)`` registration is visible in
# this file -- confirm that "foobar" is a level name loguru will accept.
logger.__class__.foobar = partialmethod(logger.__class__.log, "foobar", level='WARNING')
# Additional sink: log.txt, registered with level='WARNING'.
logger.add('log.txt', level='WARNING')
|
||||
|
||||
# Book name (as used for output folders) -> abbreviation used in the scrap
# file names, e.g. "Genesis" -> "Gen".
shorts = {
    "Nahum": "Nah",
    "Colossians": "Col",
    "Esther": "Esth",
    "James": "Jas",
    "John": "John",
    "3John": "3John",
    "Revelation": "Rev",
    "Titus": "Titus",
    "Zechariah": "Zech",
    "Malachi": "Mal",
    "Lamentations": "Lam",
    "Matthew": "Matt",
    "Jonah": "Jonah",
    "Hosea": "Hos",
    "Joel": "Joel",
    "Genesis": "Gen",
    "Proverbs": "Prov",
    "Romans": "Rom",
    "Micah": "Mic",
    "Jeremiah": "Jer",
    "Ecclesiastes": "Eccl",
    "Philippians": "Phil",
    "Galatians": "Gal",
    "Acts": "Acts",
    "Psalms": "Ps",
    "Isaiah": "Isa",
    "2Chronicles": "2Chr",
    "Job": "Job",
    "Deuteronomy": "Deut",
    "Luke": "Luke",
    "Numbers": "Num",
    "Daniel": "Dan",
    "Hebrews": "Heb",
    "Nehemiah": "Neh",
    "1John": "1John",
    "Haggai": "Hag",
    "1Kings": "1Kgs",
    "Joshua": "Josh",
    "1Corinthians": "1Cor",
    "Mark": "Mark",
    "2Kings": "2Kgs",
    "Obadiah": "Obad",
    "Ephesians": "Eph",
    "1Timothy": "1Tim",
    "Philemon": "Phlm",
    "1Chronicles": "1Chr",
    "2Samuel": "2Sam",
    "1Samuel": "1Sam",
    "2Peter": "2Pet",
    "2John": "2John",
    "Song of Solomon": "Song",
    "Jude": "Jude",
    "Amos": "Amos",
    "1Peter": "1Pet",
    "Leviticus": "Lev",
    "Habakkuk": "Hab",
    "Ruth": "Ruth",
    "1Thessalonians": "1Thess",
    "Ezra": "Ezra",
    "Ezekiel": "Ezek",
    "2Thessalonians": "2Thess",
    "2Timothy": "2Tim",
    "Exodus": "Exod",
    "2Corinthians": "2Cor",
    "Judges": "Judg",
    "Zephaniah": "Zeph",
}
|
||||
|
||||
# Per-book override for the end verse of the chapter reference that the
# script section builds when it looks up a chapter-introduction scrap.
end_verses = {
    'Obadiah': 21,
    'Philemon': 25,
    '2John': 13,
    '3John': 15,
    'Jude': 25,
}
|
||||
|
||||
class MyHTMLParser(HTMLParser):
|
||||
file = None
|
||||
passage_verse = None
|
||||
tag = None
|
||||
tag_type = None # 'start' or 'end'
|
||||
skip_re = False
|
||||
attrs = dict()
|
||||
|
||||
# All the patterns are for re.sub(). This is specifically for getting
|
||||
# list/sublist into the correct position. The first item is for the pattern
|
||||
# to find and the second is the replacement pattern. The replacement pattern
|
||||
# also ids the list so that when the self.clean_file() we can properly add
|
||||
# the correct amount of `\t` to the line.
|
||||
patterns = [
|
||||
(r"\[(\d+\.)\]", lambda x: f'\n{x.group(1)}~4'),
|
||||
(r"\((\d+\.)\)", lambda x: f'\n{x.group(1)}~3'),
|
||||
|
||||
# These share the same id. There are times were the two are a mixed
|
||||
# within the parent lists.
|
||||
(r"^\b(\d+\.)", lambda x: f'\n{x.group(1)}~2'),
|
||||
(r"(\S\s)(\d+\.)(\s\b|\W)",
|
||||
lambda x: f'{x.group(1)}\n{x.group(2)}~2{x.group(3)}'),
|
||||
|
||||
|
||||
(r"(?=[MDCLXVI])M*(C[MD]|D?C{0,3})(X[CL]|L?X{0,3})(I[XV]|V?I{0,3})\.",
|
||||
lambda x: f'\n{parser.roman_to_int(x.group())}.~1'),
|
||||
]
|
||||
|
||||
@staticmethod
|
||||
def roman_to_int(number: str) -> int:
|
||||
if number is None:
|
||||
return 0
|
||||
|
||||
# Sometimes the roman numeral comes with a '.'. Striping to not cause
|
||||
# issues
|
||||
number = number.strip('.')
|
||||
|
||||
roman = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
|
||||
total = 0
|
||||
for i in range(len(number) - 1, -1, -1):
|
||||
num = roman[number[i]]
|
||||
if 3 * num < total:
|
||||
total -= num
|
||||
else:
|
||||
total += num
|
||||
|
||||
return total
|
||||
|
||||
def create_md_file(self, base_dir: Path, folder: str, file: str):
|
||||
""" Create the path of the md file that will be written to as the
|
||||
program process the xml file.
|
||||
"""
|
||||
|
||||
md_file = Path(f'{base_dir}/{folder}/{file}.md').absolute()
|
||||
md_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
md_file.unlink(missing_ok=True)
|
||||
# logger.info(md_file.absolute())
|
||||
self.file = md_file
|
||||
return md_file
|
||||
|
||||
def write_to_file(self, data, skip_re=False):
|
||||
|
||||
# if self.file is None:
|
||||
# raise ValueError('No File specified')
|
||||
|
||||
if not skip_re:
|
||||
# Here the list/sublist are searched for and altered for later
|
||||
# processing in self.clean_file()
|
||||
for pattern, sub in self.patterns:
|
||||
# logger.debug(f'Running Pattern: {pattern} on {data[:50]}')
|
||||
data = re.sub(pattern, sub, data)
|
||||
|
||||
# logger.info(f'writing: {data!r}')
|
||||
|
||||
if self.file:
|
||||
# raise ValueError('No File specified')
|
||||
with open(self.file, 'a+') as file:
|
||||
file.write(data)
|
||||
else:
|
||||
logger.info(data)
|
||||
|
||||
def clean_file(self):
|
||||
logger.info(' CLEANING FILE')
|
||||
|
||||
if self.file is None:
|
||||
raise ValueError('No File specified')
|
||||
|
||||
# Read all the lines
|
||||
with open(self.file, 'r') as file:
|
||||
lines = file.readlines()
|
||||
|
||||
indent = 0 # Current number of '\t' to add in front of the list number
|
||||
sublist = dict() # Track the indent number based on the id.
|
||||
with open(self.file, 'w') as file:
|
||||
file.write('# Chapter Introduction\n\n')
|
||||
|
||||
header = None
|
||||
for line in lines:
|
||||
# Do not write blank lines
|
||||
if not line.strip():
|
||||
continue
|
||||
|
||||
# Main header, if a header has already been written skip it
|
||||
if line.strip().startswith('# '):
|
||||
if header == line.strip():
|
||||
continue
|
||||
header = line.strip()
|
||||
|
||||
# Add a `\n` if line is a heading.
|
||||
if line.startswith('#'):
|
||||
line = f'\n{line.strip()}'
|
||||
|
||||
# If line is not a list then add a `\n` after the paragraph,
|
||||
# and reset the indent tracker
|
||||
if line.strip()[0] not in string.digits:
|
||||
line = f'{line}\n'
|
||||
# logger.info('resetting indent')
|
||||
indent = 0
|
||||
sublist.clear()
|
||||
else:
|
||||
# Get the list id
|
||||
start = line.find('~')
|
||||
end = start + 2
|
||||
list_id = line[start:end]
|
||||
|
||||
# Get the number of indents based on the list id, if it does
|
||||
# not exist they set the value to the max value +1. If this
|
||||
# is the first it will be set to 0 (-1 +1 = 0)
|
||||
indents = "\t" * sublist.setdefault(
|
||||
list_id, max(sublist.values(), default=-1)+1
|
||||
)
|
||||
line = f'{indents}{line.replace(list_id, "")}'
|
||||
|
||||
if line.startswith('Passage: '):
|
||||
line = line.removeprefix('Passage: ')
|
||||
|
||||
file.write(line)
|
||||
|
||||
def normalize_osis_verses(self, verses):
|
||||
"""Takes this
|
||||
'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Col.1.16 Bible:Heb.1.2 Bible:Acts.17.24-Acts.17.25 Bible:Exod.5.26 Bible:Gen.5'
|
||||
and turns it into somthing more readable"""
|
||||
normalized_refs = list()
|
||||
for chunks in verses.removeprefix("Bible:").split('Bible:'):
|
||||
spanned_verses = list()
|
||||
for ref in chunks.split('-'):
|
||||
verse = ref.replace('.', ' ', 1).replace('.', ':')
|
||||
book = ref.split('.')[0]
|
||||
spanned_verses.append(
|
||||
bible.format_scripture_references(
|
||||
bible.get_references(verse.strip())
|
||||
)
|
||||
)
|
||||
|
||||
try:
|
||||
normalized = bible.format_scripture_references(
|
||||
bible.get_references('-'.join(spanned_verses))
|
||||
)
|
||||
except:
|
||||
logger.warning(f"Error with: {verses=}: {spanned_verses=}")
|
||||
raise
|
||||
|
||||
if normalized is False:
|
||||
logger.info(f"Error with: {verses=}: {spanned_verses=}")
|
||||
sys.exit()
|
||||
|
||||
normalized_refs.append(normalized)
|
||||
|
||||
return ';'.join(normalized_refs)
|
||||
|
||||
# @staticmethod
|
||||
def old_normalize_osis_verses(self, verses):
|
||||
"""Takes this
|
||||
'Bible:John.1.3 Bible:John.1.10 Bible:Eph.3.9 Bible:Acts.17.24-Acts.17.25 Bible:Exod.5.26 Bible:Gen.5'
|
||||
and turns it into somthing more readable"""
|
||||
normalized_refs = list()
|
||||
for chunks in verses.removeprefix("Bible:").split('Bible:'):
|
||||
spanned_verses = chunks.split('-')
|
||||
if len(spanned_verses) < 2:
|
||||
spanned_verses = list(itertools.repeat(spanned_verses[0], 2))
|
||||
|
||||
book, start_chapter, start_verse = spanned_verses[0].split('.')
|
||||
_, end_chapter, end_verse = spanned_verses[1].split('.')
|
||||
book = bible.Book[convert[book].upper()]
|
||||
|
||||
normalized = bible.NormalizedReference(
|
||||
book=book,
|
||||
start_chapter=start_chapter,
|
||||
start_verse=start_verse,
|
||||
end_chapter=end_chapter,
|
||||
end_verse=end_verse,
|
||||
)
|
||||
|
||||
if normalized is False:
|
||||
logger.foobar(f'{self.args=}')
|
||||
|
||||
normalized_refs.append(
|
||||
bible.format_scripture_references([normalized]).strip()
|
||||
)
|
||||
|
||||
return ';'.join(normalized_refs)
|
||||
|
||||
@staticmethod
|
||||
@logger.catch(reraise=False)
|
||||
def obsidian_links(verses):
|
||||
"""Take `Book Chapter:Ver-Ver and turn it into a obsidian bible link"""
|
||||
links = []
|
||||
for verse in verses.split(';'):
|
||||
verse = verse.replace(':', '#').replace('-', '..')
|
||||
if verse[0] in string.digits:
|
||||
verse.replace(' ', '', 1)
|
||||
links.append(f"[[{verse}]]")
|
||||
|
||||
return '; '.join(links)
|
||||
|
||||
def convert_passage(self, verses):
|
||||
book = self.file.parts[-2]
|
||||
chapter = None
|
||||
ref = []
|
||||
for ver in verses.split(','):
|
||||
try:
|
||||
chapter, v = ver.split(':')
|
||||
except:
|
||||
v = ver
|
||||
|
||||
r = []
|
||||
for i in v.split('-'):
|
||||
r.append(f'{book}.{chapter}.{i}')
|
||||
|
||||
ref.append(f'Bible:{"-".join(r)}')
|
||||
x = ' '.join(ref)
|
||||
return f'osisRef="{x}"'
|
||||
|
||||
def process_tag(self, test=None):
|
||||
match self.tag:
|
||||
case 'b': # Bold
|
||||
self.write_to_file('**')
|
||||
|
||||
case 'i': # Italics
|
||||
self.write_to_file('*')
|
||||
|
||||
case 'scripcom':
|
||||
# This saves the verse for the section.
|
||||
if self.tag_type == 'start':
|
||||
try:
|
||||
verse = bible.get_references(
|
||||
self.normalize_osis_verses(
|
||||
self.attrs['osisref'])
|
||||
)[0]
|
||||
except:
|
||||
logger.debug(self.passage_verse)
|
||||
raise
|
||||
|
||||
if verse.start_verse == verse.end_verse:
|
||||
self.write_to_file('## Verse: ')
|
||||
self.write_to_file(
|
||||
f'{verse.start_verse}'
|
||||
)
|
||||
else:
|
||||
self.write_to_file('## Verses: ')
|
||||
self.write_to_file(
|
||||
f'{verse.start_verse} - {verse.end_verse}'
|
||||
)
|
||||
self.write_to_file('\n\n')
|
||||
|
||||
case 'h4': # this is the section header. Need to keep track of it per file.
|
||||
if self.tag_type == 'start':
|
||||
self.write_to_file('\n# ',)
|
||||
self.skip_re = True
|
||||
|
||||
if self.tag_type == 'end':
|
||||
self.write_to_file('\n')
|
||||
self.skip_re = False
|
||||
|
||||
# case 'scripref': # Scripture ref
|
||||
# # get attr 'osisref' and parse [..., ('passage', 'Bible:Rev.14.6-Rev.14.7')]
|
||||
# # bible.format_scripture_references(bible.get_references(attrs['osisref']))
|
||||
# if self.tag_type == 'start':
|
||||
# if 'osisref' in self.attrs:
|
||||
# verses = self.normalize_osis_verses(
|
||||
# self.attrs['osisref']
|
||||
# )
|
||||
# else:
|
||||
# # verses = self.normalize_osis_verses(
|
||||
# # self.convert_passage(self.attrs['passage'])
|
||||
# # )
|
||||
# raise ValueError('No `osisref` in tag attrs')
|
||||
#
|
||||
# self.write_to_file(self.obsidian_links(verses))
|
||||
|
||||
case 'p': # Paragraph
|
||||
if self.tag_type == 'start':
|
||||
if self.attrs.get('class', False) == 'passage':
|
||||
# Need this so the regex in self.clean does not pick
|
||||
# this up. Will be also cleaned in the same function.
|
||||
self.write_to_file('Passage: ')
|
||||
|
||||
if self.tag_type == 'end':
|
||||
self.write_to_file('\n\n')
|
||||
|
||||
case 'li':
|
||||
if self.tag_type == 'start':
|
||||
self.write_to_file('- ')
|
||||
else:
|
||||
self.write_to_file('\n')
|
||||
|
||||
# case _:
|
||||
# # logger.info(tag.upper())
|
||||
# pass
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
# Set taf trackers
|
||||
self.tag_type = 'start'
|
||||
self.tag = tag
|
||||
self.attrs = dict(attrs)
|
||||
|
||||
self.process_tag()
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
# logger.info(f'End: {tag}')
|
||||
self.tag = tag
|
||||
self.tag_type = 'end'
|
||||
self.process_tag()
|
||||
|
||||
# Clear tag tracker
|
||||
self.tag = None
|
||||
self.tag_type = None
|
||||
self.attrs.clear()
|
||||
self.skip_re = False
|
||||
|
||||
def handle_data(self, data):
|
||||
# logger.info(f'Data: {data!r}')
|
||||
|
||||
if self.tag in ['scripref', 'h2', 'h3', 'h5']:
|
||||
return
|
||||
|
||||
if data == '\n':
|
||||
return
|
||||
|
||||
data = data.replace('—', '-- ')
|
||||
data = data.replace('GOD', 'God')
|
||||
data = data.replace('\n', ' ')
|
||||
|
||||
self.write_to_file(data, self.skip_re)
|
||||
|
||||
|
||||
def create_file(base_dir: Path, folder: str, file: str, file_type: str) -> Path:
    """Return a clean output path ``<base_dir>/<folder>/<file>.<file_type>``.

    Missing parent directories are created, and a file already present at
    that location is deleted, so the caller always starts from a path that
    does not exist yet.  The returned path is absolute.
    """
    target = Path(f'{base_dir}/{folder}/{file}.{file_type}').absolute()

    parent_dir = target.parent
    parent_dir.mkdir(parents=True, exist_ok=True)

    # Drop leftovers from a previous run; writers only ever append.
    target.unlink(missing_ok=True)
    return target
|
||||
|
||||
|
||||
def write_file(file, data):
    """Append ``data`` to ``file`` and terminate it with one newline.

    Args:
        file: Path of the file to append to; it is created when missing.
        data: Either a string, or an iterable of strings that are written
            back to back with nothing in between.
    """
    # Append-only: nothing is read back here, so plain 'a' is sufficient.
    with open(file, 'a') as f:
        if isinstance(data, str):
            # Iterating a str yields single characters; write it in one call
            # instead of issuing one write() per character.
            f.write(data)
        else:
            f.writelines(data)
        f.write('\n')
|
||||
|
||||
if __name__ == '__main__':
    # Build one consolidated HTML file per chapter (book intro, chapter intro,
    # then every commentary section with its scraped text) and feed each file
    # through the parser to produce the markdown output.
    #
    # Relies on names defined earlier in this module: `base`, `shorts`,
    # `MyHTMLParser` and `end_verses` (the latter is not visible from this
    # part of the file -- presumably a mapping of book name -> end verse).
    parser = MyHTMLParser()

    # Process each volume file.
    scraps = Path('./scraps')
    files_used = []   # names of scrap files already consumed
    missing = []      # (book, chapter, file name) for absent section files
    for book in Path('./vol_split').glob('*'):
        book_name = book.name.split(' - ')[-1]
        logger.info(book_name)
        print(book_name)
        for chapter in book.glob('*.xml'):
            chapter_name = chapter.stem
            chapter_num = chapter_name.split(' ')[-1]
            logger.info(f"{chapter_name}")

            file = create_file(base, book_name, chapter_name, 'html')

            # Book introduction: a single scrap file named after the book.
            if 'INTRODUCTION' in chapter_name.upper():
                book_intro = scraps / f"{book_name}.html"
                if not book_intro.exists():
                    # loguru treats extra positional arguments as
                    # str.format() arguments, so the details have to be part
                    # of the message itself.  read_text() below still raises
                    # FileNotFoundError for the missing file.
                    logger.warning(
                        f'Missing book intro: {book_intro.name} '
                        f'(exists={book_intro.exists()})'
                    )
                files_used.append(book_intro.name)
                write_file(file, f'<!-- Gotten from {book_intro}-->')
                write_file(file, book_intro.read_text())
                parser.create_md_file('./mh_scraps/', book_name, file.stem)
                parser.feed(file.read_text())
                parser.clean_file()
                continue

            # Chapter intro
            book_short = shorts[book_name]

            ci = bible.get_references(f"{book_name} {chapter_num}")[0]
            # NOTE(review): the override is keyed by book only, so it applies
            # to every chapter of that book -- confirm this is intended.
            if ev := end_verses.get(book_name, False):
                ci.end_verse = ev

            start = f'{book_short}_{chapter_num}_{ci.start_verse}'
            end = f'{book_short}_{chapter_num}_{ci.end_verse}'
            chapter_into = scraps / f"chapter_{start}-{end}.html"

            if not chapter_into.exists():
                logger.warning(
                    f'Missing chapter intro: {chapter_into.name} '
                    f'(exists={chapter_into.exists()})'
                )
                raise ValueError(f'Chapter intro not found: {chapter_into}')
            files_used.append(chapter_into.name)

            write_file(file, f'<!-- Chapter Intro: {chapter_into} -->')
            write_file(file, chapter_into.read_text())

            sections = BeautifulSoup(chapter.read_text(), 'xml')

            # Flatten every <p class="passage"> to plain text so that any
            # scripRef nested inside it is unwrapped; otherwise the place
            # "Dan" gets confused with the book of Daniel.
            for passage in sections.find_all('p', {'class': 'passage'}):
                new_t = sections.new_tag('p', attrs=passage.attrs)
                new_t.string = passage.text
                passage.replace_with(new_t)

            # Empty every scripRef so that tags nested inside it are not
            # processed later on.
            for scripture in sections.find_all('scripRef'):
                scripture.clear()

            write_file(file, '<!-- Chapter Sections -->')
            for section in sections.find_all('div', {'class': 'Commentary'}):
                if header := section.find('h4'):
                    write_file(file, str(header))

                # The node right before the commentary div is expected to be
                # the scripRef describing the section.
                scripture_ref = section.previous_sibling
                write_file(file, str(scripture_ref))

                # Passage text
                if passage := section.find('p', {'class': 'passage'}):
                    write_file(file, str(passage))

                logger.info(scripture_ref)
                verses_id = str(scripture_ref['osisRef'])
                verses = verses_id.replace('Bible:', '').replace('.', '_')

                # Sanity check: both ends of the range must parse and belong
                # to the chapter being processed.
                for v in verses.split('-'):
                    v = v.replace('_', ' ', 1).replace('_', ':')
                    b_ref = bible.get_references(v)
                    if not b_ref:
                        raise ValueError(f"{v} is in valid")
                    if int(b_ref[0].start_chapter) != int(chapter_num):
                        raise ValueError('Wrong Chapter')

                verse_section = scraps / f"{verses}.html"
                logger.info(f'\t\t{verse_section.name} {verse_section.exists()}')
                if not verse_section.exists():
                    # Record the gap and move on to the next section; reading
                    # the absent file would abort the whole run with
                    # FileNotFoundError and the `missing` list would never be
                    # reported.
                    logger.warning(f'Missing section file: {verse_section}')
                    missing.append((book_name, chapter_num, verse_section.name))
                    continue

                if verse_section.name in files_used:
                    raise FileExistsError(f'{verse_section} has been already used.')

                files_used.append(verse_section.name)

                write_file(file, f'<!-- {verse_section} -->')
                write_file(file, verse_section.read_text())

            parser.create_md_file('./mh_scraps/', book_name, file.stem)
            parser.feed(file.read_text())
            parser.clean_file()

    # Surface everything that could not be found in one place.
    if missing:
        logger.warning(f'{len(missing)} section file(s) missing: {missing}')
|
Loading…
Reference in New Issue
Block a user