Moved the `## Verses:` heading to the paragraph section, which is more reliable across the project. Also added the ability to skip re.sub() for heading sections.

Both of these issues were caught with Psalms 119.
2023-12-02 09:29:00 -05:00 · 2023-12-02 09:29:00 -05:00 · f73b11d153
commit f73b11d153
parent a1e69b1368
1 changed files with 43 additions and 33 deletions
--- a/main.py
+++ b/main.py
@ -9,7 +9,7 @@ from bs4 import BeautifulSoup
 from loguru import logger
 from tqdm import tqdm

-base = Path('./Matthew Henry Commentary/xml').absolute()
+base = Path('./xml/Matthew Henry Commentary').absolute()

 logger.remove(0)
 logger.add('log.txt')
@ -19,6 +19,7 @@ class MyHTMLParser(HTMLParser):
    passage_verse = None
    tag = None
    tag_type = None  # 'start' or 'end'
+    skip_re = False
    attrs = dict()

    # All the patterns are for re.sub(). This is specifically for getting
@ -73,21 +74,26 @@ class MyHTMLParser(HTMLParser):
        self.file = md_file
        return md_file

-    def write_to_file(self, data):
+    def write_to_file(self, data, skip_re=False):

-        if self.file is None:
-            raise ValueError('No File specified')
+        # if self.file is None:
+        #     raise ValueError('No File specified')

-        # Here the list/sublist are searched for and altered for later
-        # processing in self.clean_file()
-        for pattern, sub in self.patterns:
-            # print(f'Running Pattern: {pattern}')
-            data = re.sub(pattern, sub, data)
+        if not skip_re:
+            # Here the list/sublist are searched for and altered for later
+            # processing in self.clean_file()
+            for pattern, sub in self.patterns:
+                # logger.debug(f'Running Pattern: {pattern} on {data[:50]}')
+                data = re.sub(pattern, sub, data)

        # print(f'writing: {data!r}')

-        with open(self.file, 'a+') as file:
-            file.write(data)
+        if self.file:
+            # raise ValueError('No File specified')
+            with open(self.file, 'a+') as file:
+                file.write(data)
+        else:
+            print(data)

    def clean_file(self):
        logger.info(' CLEANING FILE')
@ -114,7 +120,7 @@ class MyHTMLParser(HTMLParser):
                if line.strip().startswith('# '):
                    if header == line.strip():
                        continue
-                    header = line.strip()
+                    header = line

                # Add a `\n` if line is a heading.
                if line.startswith('#'):
@ -215,27 +221,12 @@ class MyHTMLParser(HTMLParser):

            case 'h4':  # this is the section header. Need to keep track of it per file.
                if self.tag_type == 'start':
-                    self.write_to_file('\n# ')
+                    self.write_to_file('\n# ',)
+                    self.skip_re = True

                if self.tag_type == 'end':
                    self.write_to_file('\n')
-
-                    #
-                    if self.passage_verse:
-                        try:
-                            verse = bible.get_references(
-                                self.normalize_osis_verses(
-                                    self.passage_verse)
-                            )[0]
-                        except:
-                            logger.debug(self.passage_verse)
-                            raise
-
-                        self.write_to_file('## Verses: ')
-                        self.write_to_file(
-                            f'{verse.start_verse} - {verse.end_verse}'
-                        )
-                        self.write_to_file('\n\n')
+                    self.skip_re = False

            case 'scripref':  # Scripture ref
                # get attr 'osisref' and parse [..., ('passage', 'Bible:Rev.14.6-Rev.14.7')]
@ -255,6 +246,22 @@ class MyHTMLParser(HTMLParser):
            case 'p':  # Paragraph
                if self.tag_type == 'start':
                    if self.attrs.get('class', False) == 'passage':
+                        if self.passage_verse:
+                            try:
+                                verse = bible.get_references(
+                                    self.normalize_osis_verses(
+                                        self.passage_verse)
+                                )[0]
+                            except:
+                                logger.debug(self.passage_verse)
+                                raise
+
+                            self.write_to_file('## Verses: ')
+                            self.write_to_file(
+                                f'{verse.start_verse} - {verse.end_verse}'
+                            )
+                            self.write_to_file('\n\n')
+
                        # Need this do the regex in self.clean does not pick
                        # this up. Will be also cleaned in the same function.
                        self.write_to_file('Passage: ')
@ -289,6 +296,7 @@ class MyHTMLParser(HTMLParser):
        self.tag = None
        self.tag_type = None
        self.attrs.clear()
+        self.skip_re = False

    def handle_data(self, data):
        # print(f'Data: {data!r}')
@ -301,8 +309,9 @@ class MyHTMLParser(HTMLParser):

        data = data.replace('—', '-- ')
        data = data.replace('GOD', 'God')
+        data = data.replace('\n', ' ')

-        self.write_to_file(data.replace('\n', ' '))
+        self.write_to_file(data, self.skip_re)


 if __name__ == '__main__':
@ -317,12 +326,11 @@ if __name__ == '__main__':
        # Get each book in the volume
        # This will be the main folder for all the book's chapters
        for book in soup.find_all('div1'):
-            logger.info(book['title'])
            book_name = book['title'].replace('First', '1').replace('Second', '2').replace('Third', '3')
+            logger.info(book_name)

            # These are the chapters/files for each book folder
            for chapter in book.find_all('div2'):
-                logger.info(chapter['title'])

                filename = chapter['title']

@ -331,6 +339,7 @@ if __name__ == '__main__':
                    _, roman_num = chapter['title'].split(' ')
                    filename = f'Chapter {parser.roman_to_int(roman_num)}'

+                logger.info(filename)
                parser.create_md_file(base, book_name, filename)

                pbar.set_description(f"Processing: {vol.name}: {book_name}: {filename}")
@ -353,6 +362,7 @@ if __name__ == '__main__':
                for doc in chapter.children:

                    try:
+                        logger.debug(f"{book_name}, {filename}")
                        parser.feed(str(doc))
                    except Exception as e:
                        logger.exception(e)