#! /sed -f
# Thu May 18 12:43:45 2000 by tilmann@bitterberg.de
# Changed Mon Feb 26 12:34:19 2000 by bonzini@gnu.org (removed
# limitations on the number of links)
#
# Description:
#   Creates an index of links from an HTML file.
#   Does something similar to lynx -force_html -dump, but
#   leaves the document as HTML (it generates an index of links).
#
# Example: Input
#
#   foo1 <a href="http://link.org">Click here</a> foo2
#
#
# Output:
#
#   foo1 <a href="http://link.org">[1] Click here</a> foo2
#
#   [1] http://link.org
# # # NOTE: # 1) Will break at links like foo2\n397\nPREVIOUS ENTRIES # using newline as separator to the 's' command s ]*>\)\([^\n]*\(\n\)\)\([^\n]*\)\(.*$\) <||||||=\1[\5] \3\5\4\6\4[\5] \2
# |----------1---------||-------3------| |---5---||--6--|
# |---2----| |--4-|
#
# Field  Contains:
#   \1   the link text up to the closing >
#   \2   the link itself (http://foo.com)
#   \3   the rest of the input line
#   \4   a newline (\n)
#   \5   the number we would like to use
#   \6   everything up to the end of pattern space
#        (i.e. the previous entries)
#
# Now the line looks like:
#   foo1 <||||||="blah.html">[1] foo2\n1\n[1] blah.html
h; # save the new entry and the updated counter s/[^\n]*\n// # remove the HTML line from hold space x s/\n.*// # remove what went in hold space from pattern space b loop; # look if there is another link in that line } s/<||||||/
insert index /<\/[Bb][Oo][Dd][Yy]>/{ G # insert saved stuff s/\(<\/[Bb][Oo][Dd][Yy]>[^\n]*\)\n[^\n]*\n*\(.*\)/
\2\1/; }