Skip to content

Commit

Permalink
Make htmlStripper.py and html_table_parser examples use PEP-8 names, …
Browse files Browse the repository at this point in the history
…add comments, handle tags inside quoted strings
  • Loading branch information
ptmcg committed Jun 2, 2023
1 parent 7d4da80 commit 801863a
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 57 deletions.
42 changes: 0 additions & 42 deletions examples/htmlStripper.py

This file was deleted.

58 changes: 58 additions & 0 deletions examples/html_stripper.py
@@ -0,0 +1,58 @@
#
# html_stripper.py
#
# Sample code for stripping HTML markup tags and scripts from
# HTML source files.
#
# Copyright (c) 2006, 2016, 2023, Paul McGuire
#
from urllib.request import urlopen
from pyparsing import (
LineEnd,
quoted_string,
make_html_tags,
common_html_entity,
replace_html_entity,
html_comment,
any_open_tag,
any_close_tag,
replace_with,
)

# if <script> tags are found, remove the enclosed script content as well
script_open, script_close = make_html_tags("script")
script_body = script_open + ... + script_close

# translate HTML entities (&amp;, &lt;, &gt;, etc.) to their characters
common_html_entity.set_parse_action(replace_html_entity)

# everything to be removed outright - suppress() drops these
# from the output when transforming
removable = (html_comment | script_body | any_open_tag | any_close_tag).suppress()

# order matters: quoted strings are tried first so that any HTML tags
# enclosed in quotes are kept verbatim; entities are translated; all
# remaining comments, scripts, and tags are stripped
stripper = quoted_string | common_html_entity | removable

# expression to collapse runs of 2 or more newlines down to a single blank line
repeated_newlines = LineEnd()[2, ...]
repeated_newlines.set_parse_action(replace_with("\n\n"))


if __name__ == "__main__":
    # get some HTML to strip
    target_url = "https://wiki.python.org/moin/PythonDecoratorLibrary"
    with urlopen(target_url) as target_page:
        target_html = target_page.read().decode("UTF-8")

    # first pass, strip out tags and translate entities
    # (use transform_string() instead of parse_string() - transform_string
    # rewrites the input, applying suppressions and parse actions)
    first_pass = stripper.transform_string(target_html)

    # the first pass leaves many blank lines; collapse these down
    second_pass = repeated_newlines.transform_string(first_pass)

    print(second_pass)
34 changes: 19 additions & 15 deletions examples/htmlTableParser.py → examples/html_table_parser.py
Expand Up @@ -11,16 +11,16 @@


# define basic HTML tags, and compose into a Table
table, table_end = pp.make_html_tags("table")
thead, thead_end = pp.make_html_tags("thead")
tbody, tbody_end = pp.make_html_tags("tbody")
tr, tr_end = pp.make_html_tags("tr")
th, th_end = pp.make_html_tags("th")
td, td_end = pp.make_html_tags("td")
a, a_end = pp.make_html_tags("a")

# method to strip HTML tags from a string - will be used to clean up content of table cells
strip_html = (pp.any_open_tag | pp.any_close_tag).suppress().transform_string

# expression for parsing <a href="url">text</a> links, returning a (text, url) tuple
link = pp.Group(a + a.tag_body("text") + a_end.suppress())
Expand All @@ -32,13 +32,14 @@ def extract_text_and_url(t):

# use the PEP-8 pyparsing 3 name, consistent with the rest of this refactor
link.add_parse_action(extract_text_and_url)


# method to create table rows of header and data tags
def table_row(start_tag, end_tag):
    """Return an expression matching one <tr> row whose cells are
    delimited by start_tag/end_tag (e.g. <th>...</th> or <td>...</td>);
    each cell's content is whitespace-trimmed and stripped of HTML tags.
    """
    body = start_tag.tag_body
    # clean up each cell: trim whitespace, then remove any embedded HTML tags
    body.add_parse_action(pp.token_map(str.strip), pp.token_map(strip_html))
    row = pp.Group(
        tr.suppress()
        + (start_tag.suppress() + body + end_tag.suppress())[...]
        + tr_end.suppress()
    )
    return row
Expand All @@ -51,8 +52,8 @@ def table_row(start_tag, end_tag):
# a table is an optional header row followed by any number of data rows,
# all wrapped in <table><tbody> ... </tbody></table>
html_table = (
    table
    + tbody
    + th_row[...]("headers")
    + td_row[...]("rows")
    + tbody_end
    + table_end
)
Expand All @@ -67,11 +68,14 @@ def table_row(start_tag, end_tag):
# scan the page for the first matching table (PEP-8 pyparsing 3 name)
tz_table = html_table.search_string(page_html)[0]

# convert rows to dicts, keyed by the column headers
# (headers[0] is the single parsed header row)
rows = [dict(zip(tz_table.headers[0], row)) for row in tz_table.rows]

# make a dict keyed by TZ database identifier
# (get identifier key from second column header)
identifier_key = tz_table.headers[0][1]
tz_db = {row[identifier_key]: row for row in rows}

from pprint import pprint

pprint(tz_db["Zulu"])

0 comments on commit 801863a

Please sign in to comment.