summaryrefslogtreecommitdiff
path: root/tools/html2text.py
diff options
context:
space:
mode:
authorGerald Combs <gerald@wireshark.org>2008-12-17 19:49:18 +0000
committerGerald Combs <gerald@wireshark.org>2008-12-17 19:49:18 +0000
commitf49377e0e7be6ef7d30ff1dfe685b079ef06249f (patch)
treef9301bcd79a6cdc4b5d9ff541e3fddfe0187aba2 /tools/html2text.py
parent79413d1f89648fe292bd0e42391084d7b2f81b6f (diff)
downloadwireshark-f49377e0e7be6ef7d30ff1dfe685b079ef06249f.tar.gz
Update html2text.py to suit our needs. Add spaces in the faq.txt target
so to fix a problem with OS X 10.4. Add html2text.py to the end of the faq.txt target. svn path=/trunk/; revision=27040
Diffstat (limited to 'tools/html2text.py')
-rwxr-xr-xtools/html2text.py110
1 files changed, 83 insertions, 27 deletions
diff --git a/tools/html2text.py b/tools/html2text.py
index 169ab0b894..9ae6c66fb6 100755
--- a/tools/html2text.py
+++ b/tools/html2text.py
@@ -1,34 +1,82 @@
#!/usr/bin/env python
"""html2text: Turn HTML into equivalent Markdown-structured text."""
-__version__ = "2.35"
+__version__ = "2.35-Wireshark"
__author__ = "Aaron Swartz (me@aaronsw.com)"
__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes"]
+# NOTE:
+# This is a modified version of html2text.py from http://www.aaronsw.com/2002/html2text/
+# Changes:
+# Options can now be configured from the command line.
+# SKIP_LINKS and INPUT_ENCODING options have been added.
+# The script now requires Python 2.3
+
# TODO:
# Support decoded entities with unifiable.
# Relative URL resolution
+# Indent sections and lists similar to elinks/links/lynx
if not hasattr(__builtins__, 'True'): True, False = 1, 0
import re, sys, urllib, htmlentitydefs, codecs, StringIO, types
import sgmllib
sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')
+from optparse import OptionParser
try: from textwrap import wrap
except: pass
-# Use Unicode characters instead of their ascii psuedo-replacements
-UNICODE_SNOB = 0
-
-# Put the links after each paragraph instead of at the end.
-LINKS_EACH_PARAGRAPH = 0
-
-# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
-BODY_WIDTH = 78
-
-# Don't show internal links (href="#local-anchor") -- corresponding link targets
-# won't be visible in the plain text file anyway.
-SKIP_INTERNAL_LINKS = False
+oparser = OptionParser()
+options = None
+args = None
+
+oparser.add_option(
+ "--force-unicode",
+ action="store_true",
+ dest="UNICODE_SNOB",
+ default=False,
+ help="Use Unicode characters instead of their ascii psuedo-replacements. [default: False]",
+ )
+
+oparser.add_option(
+ "--links-after-paragraphs",
+ action="store_true",
+ dest="LINKS_EACH_PARAGRAPH",
+ default=False,
+ help="Put the links after each paragraph instead of at the end. [default: False]",
+ )
+
+oparser.add_option(
+ "--width",
+ type="int",
+ dest="BODY_WIDTH",
+ default=78,
+ help="Wrap long lines at position. 0 for no wrapping. Requires Python 2.3. [default: 78 characters]",
+ )
+
+oparser.add_option(
+ "--no-internal-links",
+ action="store_true",
+ dest="SKIP_INTERNAL_LINKS",
+ default=False,
+ help='''Don't show internal links (href="#local-anchor"). Corresponding link targets won't be visible in the plain text file anyway. [default: False]''',
+ )
+
+oparser.add_option(
+ "--no-links",
+ action="store_true",
+ dest="SKIP_LINKS",
+ default=False,
+ help='''Don't show links. [default: False]''',
+ )
+
+oparser.add_option(
+ "--input-encoding",
+ type="string",
+ dest="INPUT_ENCODING",
+ default='utf-8',
+ help='''Force the encoding of the input file. [default: utf-8]''',
+ )
### Entity Nonsense ###
@@ -56,18 +104,22 @@ for k in unifiable.keys():
unifiable_n[name2cp(k)] = unifiable[k]
def charref(name):
+ global options
+
if name[0] in ['x','X']:
c = int(name[1:], 16)
else:
c = int(name)
- if not UNICODE_SNOB and c in unifiable_n.keys():
+ if not options.UNICODE_SNOB and c in unifiable_n.keys():
return unifiable_n[c]
else:
return unichr(c)
def entityref(c):
- if not UNICODE_SNOB and c in unifiable.keys():
+ global options
+
+ if not options.UNICODE_SNOB and c in unifiable.keys():
return unifiable[c]
else:
try: name2cp(c)
@@ -103,7 +155,8 @@ def onlywhite(line):
def optwrap(text):
"""Wrap all paragraphs in the provided text."""
- if not BODY_WIDTH:
+ global options
+ if not options.BODY_WIDTH:
return text
assert wrap, "Requires Python 2.3."
@@ -112,7 +165,7 @@ def optwrap(text):
for para in text.split("\n"):
if len(para) > 0:
if para[0] is not ' ' and para[0] is not '-' and para[0] is not '*':
- for line in wrap(para, BODY_WIDTH):
+ for line in wrap(para, options.BODY_WIDTH):
result += line + "\n"
result += "\n"
newlines = 2
@@ -156,7 +209,7 @@ class _html2text(sgmllib.SGMLParser):
self.abbr_title = None # current abbreviation definition
self.abbr_data = None # last inner HTML (for abbr being defined)
self.abbr_list = {} # stack of abbreviations to write later
-
+
def outtextf(self, s):
self.outtext += s
@@ -204,6 +257,7 @@ class _html2text(sgmllib.SGMLParser):
if match: return i
def handle_tag(self, tag, attrs, start):
+ global options
attrs = fixattrs(attrs)
if hn(tag):
@@ -258,7 +312,7 @@ class _html2text(sgmllib.SGMLParser):
attrsD = {}
for (x, y) in attrs: attrsD[x] = y
attrs = attrsD
- if attrs.has_key('href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')):
+ if attrs.has_key('href') and not (options.SKIP_LINKS or (options.SKIP_INTERNAL_LINKS and attrs['href'].startswith('#'))):
self.astack.append(attrs)
self.o("[")
else:
@@ -381,7 +435,7 @@ class _html2text(sgmllib.SGMLParser):
if not self.lastWasNL: self.out(' ')
self.space = 0
- if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
+ if self.a and ((self.p_p == 2 and options.LINKS_EACH_PARAGRAPH) or force == "end"):
if force == "end": self.out("\n")
newa = []
@@ -415,6 +469,10 @@ class _html2text(sgmllib.SGMLParser):
def wrapwrite(text): sys.stdout.write(text.encode('utf8'))
def html2text_file(html, out=wrapwrite):
+ global options, args, oparser
+ if options is None or args is None:
+ (options, args) = oparser.parse_args(None, None)
+
h = _html2text(out)
h.feed(html)
h.feed("")
@@ -424,8 +482,9 @@ def html2text(html):
return optwrap(html2text_file(html, None))
if __name__ == "__main__":
- if sys.argv[1:]:
- arg = sys.argv[1]
+ (options, args) = oparser.parse_args()
+ if len(args) > 0:
+ arg = args[0]
if arg.startswith('http://'):
j = urllib.urlopen(arg)
try:
@@ -438,11 +497,8 @@ if __name__ == "__main__":
data = text.decode(encoding)
else:
- encoding = 'utf8'
- if len(sys.argv) > 2:
- encoding = sys.argv[2]
- data = open(arg, 'r').read().decode(encoding)
+ data = open(arg, 'r').read().decode(options.INPUT_ENCODING)
else:
- data = sys.stdin.read().decode('utf8')
+ data = sys.stdin.read().decode(options.INPUT_ENCODING)
wrapwrite(html2text(data))