#! /bin/sed -nf # Join lines if we have tags that span multiple lines :join /<[^>]*$/ { N s/[ *]\n[ *]/ / b join } # Do some selection to speed the thing up /<[ ]*\([aA]\|[iI][mM][gG]\)/ !b # Remove extra spaces before/after the tag name, change img/area to a s/<[ ]*\([aA]\|[iI][mM][gG]|[aA][rR][eE][aA]\)[ ]\+/<a /g # To simplify the regexps that follow, change href/alt to lowercase # and replace whitespace before them with a single space s/<a\([^>]*\)[ ][hH][rR][eE][fF]=/<a\1 href=/g s/<a\([^>]*\)[ ][aA][lL][tT]=/<a\1 alt=/g # To simplify the regexps that follow, quote the arguments to href and alt s/href=\([^" >]\+\)/href="\1"/g s/alt=\([^" >]\+\)/alt="\1"/g # Move the alt tag after href, remove attributes between them s/\( alt="[^"]*"\)[^>]*\( href="[^"]*"\)/\2\1/g # Remove attributes between <a and href s/<a[^>]* href="/<a href="/g # Change href="xxx" ... alt="yyy" to href="xxx|yyy" s/\(<a href="[^"]*\)"[^>]* alt="\([^"]*"\)/\1|\2/g t loop # Print an URL, remove it, and loop :loop h s/.*<a href="\([^"]*\)".*$/\1/p g s/\(.*\)<a href="\([^"]*\)".*$/\1/ t loop ### colorized by sedsed, a debugger and code formatter for sed scripts ### original script: http://sed.sf.net/grabbag/scripts/list_urls.sed