#! /bin/sed -nf
# List the URLs referenced by <a>, <img> and <area> tags, one per output line.
# When the tag also carries an alt attribute the line is "url|alt text".
# The script relies on auto-print being off (-n): only the explicit `p` flag
# in the final loop produces output.  It uses GNU extensions (\| and \+).
#
# Join lines if we have tags that span multiple lines: as long as the last
# `<` in the pattern space has no closing `>`, append the next input line and
# collapse the line break plus surrounding whitespace into a single space.
:join
/<[^>]*$/ {
# `N` ends the script when there is no next line, which would discard
# everything collected so far; on the last line just fall through instead.
$!{
N
s/[[:space:]]*\n[[:space:]]*/ /
b join
}
}
# Do some selection to speed the thing up: lines without any candidate tag
# are dropped immediately (`<a` also covers `<area`).
/<[[:space:]]*\([aA]\|[iI][mM][gG]\)/!b
# Remove extra whitespace before/after the tag name, change img/area to a
s/<[[:space:]]*\([aA]\|[iI][mM][gG]\|[aA][rR][eE][aA]\)[[:space:]]\+/<a /g
# To simplify the regexps that follow, change href/alt to lowercase
# and make sure each is preceded by a plain space
s/<a\([^>]*\)[[:space:]][hH][rR][eE][fF]=/<a\1 href=/g
s/<a\([^>]*\)[[:space:]][aA][lL][tT]=/<a\1 alt=/g
# To simplify the regexps that follow, quote the arguments to href and alt
s/href=\([^"[:space:]>]\+\)/href="\1"/g
s/alt=\([^"[:space:]>]\+\)/alt="\1"/g
# Move the alt attribute after href, remove attributes between them
s/\( alt="[^"]*"\)[^>]*\( href="[^"]*"\)/\2\1/g
# Remove attributes between <a and href
s/<a[^>]* href="/<a href="/g
# Change href="xxx" ... alt="yyy" to href="xxx|yyy"
s/\(<a href="[^"]*\)"[^>]* alt="\([^"]*"\)/\1|\2/g
# Branching to the very next label only serves to reset the "substitution
# succeeded" flag, so the `t` at the bottom of the loop reflects the loop's
# own substitution and not the ones above.
t loop
# Print a URL, remove it, and loop.  The leading `.*` is greedy, so the
# LAST remaining URL of the line is printed first.
:loop
# Save the line, reduce the working copy to the URL and print it ...
h
s/.*<a href="\([^"]*\)".*$/\1/p
# ... then restore the line and cut it off just before that tag.
g
s/\(.*\)<a href="\([^"]*\)".*$/\1/
t loop
### colorized by sedsed, a debugger and code formatter for sed scripts
### original script: http://sed.sf.net/grabbag/scripts/list_urls.sed