#! /bin/sed -nf
#
# Get the title of an HTML document.
# Look first for TITLE tag, then fall back on first Heading tag. Otherwise
# produce nothing.
#
# Casper Boden-Cummins <mister_pink@bigfoot.com>, 1998
#
/<[Tt][Ii][Tt][Ll][Ee]/ {
:title
# read up to closing tag
/<\/[Tt][Ii][Tt][Ll][Ee]>/ !{
N
b title
}
# strip leading/trailing whitespace
# Mario Niebaum <Mario.Niebaum@e-technik.tu-chemnitz.de>
s/^.*<[tT][iI][tT][lL][eE]>[ ]*//
s/[ ]*<\/[tT][iI][tT][lL][eE]>.*$//
# strip HTML tags and print
b print
}
/<[Hh][0-7]/ {
:heading
# lowercase heading tags
s/\(<\/\{0,1\}\)[Hh]\([0-7]\)/\1h\2/g
# strip characters up to opening tag
G
s/<h\([0-7]>\)\(.*\)\(.\)$/\3\1\2/
s/.*\n/<h/
:match_tags
/^<h\([0-7]\).*<\/h\1/ !{
N
b match_tags
}
# strip leading/trailing whitespace
s/^.*<[Hh][0-7]>[ ]*//
s/[ ]*<\/[Hh][0-7]>.*$//
# clean up and print
b print
}
# nothing found, so continue to next line
b
# print:
# remove embedded tags and leading/trailing whitespace from
# pattern space, print result and quit
:print
# reduce leading/trailing whitespace around newlines
s/[ ]*\n[ ]*/ /g
# convert embedded TAGS to whitespace
s/[ ]*<\/*[a-zA-Z0-9]\{1,\}>[ ]*/ /g
# remove non-printable characters
s/[^ -;=?-~]//g
# strip leading/trailing whitespace from line
s/^[ ]*\(.*[^ ]\)[ ]*$/\1/
# print and quit
p
q
### colorized by sedsed, a debugger and code formatter for sed scripts
### original script: http://sed.sf.net/grabbag/scripts/get_html_title.sed