Skip to content

Commit

Permalink
Make htmlStripper.py and html_table_parser examples use PEP-8 names, …
Browse files Browse the repository at this point in the history
…add comments, handle tags inside quoted strings
  • Loading branch information
ptmcg committed Jun 2, 2023
1 parent 7d4da80 commit 801863a
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 57 deletions.
42 changes: 0 additions & 42 deletions examples/htmlStripper.py

This file was deleted.

58 changes: 58 additions & 0 deletions examples/html_stripper.py
@@ -0,0 +1,58 @@
#
# html_stripper.py
#
# Sample code for stripping HTML markup tags and scripts from
# HTML source files.
#
# Copyright (c) 2006, 2016, 2023, Paul McGuire
#
from urllib.request import urlopen
from pyparsing import (
LineEnd,
quoted_string,
make_html_tags,
common_html_entity,
replace_html_entity,
html_comment,
any_open_tag,
any_close_tag,
replace_with,
)

# if <script> tags are found, remove the enclosed script content as well
script_open, script_close = make_html_tags("script")
script_body = script_open + ... + script_close

# translate HTML entities (&amp;, &lt;, &gt;, etc.) to their characters
common_html_entity.set_parse_action(replace_html_entity)

# everything to be removed outright - suppress() drops these
# from the output when transforming
removable = (html_comment | script_body | any_open_tag | any_close_tag).suppress()

# order matters: quoted strings are tried first so that any HTML tags
# enclosed in quotes are kept verbatim; entities are translated; all
# remaining comments, scripts, and tags are stripped
stripper = quoted_string | common_html_entity | removable

# expression to collapse runs of 2 or more newlines down to a single blank line
repeated_newlines = LineEnd()[2, ...]
repeated_newlines.set_parse_action(replace_with("\n\n"))


if __name__ == "__main__":
    # get some HTML to strip
    target_url = "https://wiki.python.org/moin/PythonDecoratorLibrary"
    with urlopen(target_url) as target_page:
        target_html = target_page.read().decode("UTF-8")

    # first pass, strip out tags and translate entities
    # (use transform_string() instead of parse_string() - transform_string
    # rewrites the input, applying suppressions and parse actions)
    first_pass = stripper.transform_string(target_html)

    # the first pass leaves many blank lines; collapse these down
    second_pass = repeated_newlines.transform_string(first_pass)

    print(second_pass)
34 changes: 19 additions & 15 deletions examples/htmlTableParser.py → examples/html_table_parser.py
Expand Up @@ -11,16 +11,16 @@


# define basic HTML tags, and compose into a Table
table, table_end = pp.make_html_tags("table")
thead, thead_end = pp.make_html_tags("thead")
tbody, tbody_end = pp.make_html_tags("tbody")
tr, tr_end = pp.make_html_tags("tr")
th, th_end = pp.make_html_tags("th")
td, td_end = pp.make_html_tags("td")
a, a_end = pp.make_html_tags("a")

# method to strip HTML tags from a string - will be used to clean up content of table cells
strip_html = (pp.any_open_tag | pp.any_close_tag).suppress().transform_string

# expression for parsing <a href="url">text</a> links, returning a (text, url) tuple
link = pp.Group(a + a.tag_body("text") + a_end.suppress())
Expand All @@ -32,13 +32,14 @@ def extract_text_and_url(t):

# use the PEP-8 pyparsing 3 name, consistent with the rest of this refactor
link.add_parse_action(extract_text_and_url)


# method to create table rows of header and data tags
def table_row(start_tag, end_tag):
    """Return an expression matching one <tr> row whose cells are
    delimited by start_tag/end_tag (e.g. <th>...</th> or <td>...</td>);
    each cell's content is whitespace-trimmed and stripped of HTML tags.
    """
    body = start_tag.tag_body
    # clean up each cell: trim whitespace, then remove any embedded HTML tags
    body.add_parse_action(pp.token_map(str.strip), pp.token_map(strip_html))
    row = pp.Group(
        tr.suppress()
        + (start_tag.suppress() + body + end_tag.suppress())[...]
        + tr_end.suppress()
    )
    return row
Expand All @@ -51,8 +52,8 @@ def table_row(start_tag, end_tag):
# a table is an optional header row followed by any number of data rows,
# all wrapped in <table><tbody> ... </tbody></table>
html_table = (
    table
    + tbody
    + th_row[...]("headers")
    + td_row[...]("rows")
    + tbody_end
    + table_end
)
Expand All @@ -67,11 +68,14 @@ def table_row(start_tag, end_tag):
# scan the page for the first matching table (PEP-8 pyparsing 3 name)
tz_table = html_table.search_string(page_html)[0]

# convert rows to dicts, keyed by the column headers
# (headers[0] is the single parsed header row)
rows = [dict(zip(tz_table.headers[0], row)) for row in tz_table.rows]

# make a dict keyed by TZ database identifier
# (get identifier key from second column header)
identifier_key = tz_table.headers[0][1]
tz_db = {row[identifier_key]: row for row in rows}

from pprint import pprint

pprint(tz_db["Zulu"])

0 comments on commit 801863a

Please sign in to comment.