#! /bin/sed -nf
#
# Get the title of an HTML document.
# Look first for a TITLE tag, then fall back on the first heading tag.
# Otherwise produce nothing.
#
# Usage: sed -nf get_html_title.sed file.html
# The script relies on -n: only the explicit `p' in the print routine
# produces output, and it is followed by `q', so at most one line is printed.
#
# Casper Boden-Cummins <mister_pink@bigfoot.com>, 1998
#

# ---------------------------------------------------------------------------
# TITLE element
# ---------------------------------------------------------------------------
/<[Tt][Ii][Tt][Ll][Ee]/ {
    :title
    # Keep appending input lines until the closing tag is in the pattern
    # space.  If input ends first, N terminates the script and, because of
    # -n, nothing is printed.
    /<\/[Tt][Ii][Tt][Ll][Ee]>/!{
        N
        b title
    }

    # Strip everything up to and including the opening tag (attributes
    # allowed, e.g. <title lang="en">) plus following blanks, then the
    # closing tag and everything after it.
    # Mario Niebaum <Mario.Niebaum@e-technik.tu-chemnitz.de>
    s/^.*<[Tt][Ii][Tt][Ll][Ee][^>]*>[[:blank:]]*//
    s/[[:blank:]]*<\/[Tt][Ii][Tt][Ll][Ee]>.*$//

    # strip embedded HTML tags and print
    b print
}

# ---------------------------------------------------------------------------
# Heading element (fallback).  The digit range 0-7 is kept from the original
# script even though HTML only defines h1..h6.
# Only lines holding a complete opening tag (including its `>') are accepted,
# so the substitutions below are guaranteed to find it.
# ---------------------------------------------------------------------------
/<[Hh][0-7][^>]*>/ {
    # Lowercase heading tags so that the back-reference match below does
    # not have to deal with mixed case.
    s/\(<\/\{0,1\}\)[Hh]\([0-7]\)/\1h\2/g

    # Strip characters up to the FIRST opening heading tag.
    # G appends a newline (the hold space is never written, so it is empty).
    # The substitution moves that newline to where "<h" of the leftmost
    # opening tag was; the second substitution then replaces everything up
    # to the newline with "<h" again.  This emulates a non-greedy ".*".
    G
    s/<h\([0-7][^>]*>\)\(.*\)\(.\)$/\3\1\2/
    s/.*\n/<h/

    :match_tags
    # Read on until a closing tag of the same level as the opening tag is
    # present.  Newly appended lines are lowercased too, otherwise an
    # upper-case closing tag on a later line would never match.
    /^<h\([0-7]\).*<\/h\1/!{
        N
        s/\(<\/\{0,1\}\)[Hh]\([0-7]\)/\1h\2/g
        b match_tags
    }

    # The pattern space now starts with the opening tag: remove exactly that
    # tag (anchored, so a second heading on the same line cannot be picked
    # instead), then cut from the first closing heading tag onwards.
    s/^<h[0-7][^>]*>[[:blank:]]*//
    s/[[:blank:]]*<\/h[0-7][^>]*>.*$//

    # clean up and print
    b print
}

# nothing found, so continue to next line
b

# ---------------------------------------------------------------------------
# print:
# remove embedded tags and leading/trailing whitespace from
# pattern space, print result and quit
# ---------------------------------------------------------------------------
:print
# reduce leading/trailing whitespace around newlines to a single space
s/[[:blank:]]*\n[[:blank:]]*/ /g

# Turn remaining tabs into spaces; the non-printable filter below would
# otherwise delete them and glue the neighbouring words together.
s/[[:blank:]]/ /g

# convert embedded TAGS (with or without attributes) to whitespace
s/[[:blank:]]*<\/*[a-zA-Z0-9][^>]*>[[:blank:]]*/ /g

# Remove every character outside the listed ranges: control characters and
# stray `<' / `>' are dropped.  NOTE(review): in the C locale this also drops
# all non-ASCII bytes, so non-ASCII titles lose characters - confirm whether
# that is still wanted.
s/[^ -;=?-~]//g

# strip leading/trailing whitespace from line
s/^[[:blank:]]*\(.*[^[:blank:]]\)[[:blank:]]*$/\1/

# print and quit
p
q

### colorized by sedsed, a debugger and code formatter for sed scripts
### original script: http://sed.sf.net/grabbag/scripts/get_html_title.sed