#!/usr/bin/env python
#
# Copyright (c) 2006-2007, Hans Meine
# All rights reserved.
#
# This is licensed according to the new BSD license.
# Please send patches / comments, I would be happy about any feedback.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the distribution.
#
# * Neither the name of the University of Hamburg nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

"""Extract URLs from files, web pages or stdin and print one per line.

Two kinds of matches are reported: bare URLs (an explicit
http/https/ftp/gopher/mailto scheme, or a host starting with "www." or
"ftp.") and the targets of href="..." attributes.  href targets are
resolved against a base URL; bare URLs are printed as found.

The script runs unchanged on Python 2.6+ and Python 3.
"""

import getopt
import re
import sys

try:
    # Python 2 module layout.
    import urlparse
    import urllib
    urlopen = urllib.urlopen
except ImportError:
    # Python 3 moved both pieces into the urllib package.
    import urllib.parse as urlparse
    import urllib.request
    urlopen = urllib.request.urlopen

# Characters that terminate a bare URL.  CR and LF are included so that a
# URL at the end of a line does not swallow the first word of the next one.
endChars = ' ,;\t\r\n<>"\''

# Group 1: bare URL; group 3: its explicit scheme (None for the scheme-less
# "www." / "ftp." form); group 5: the target of an href="..." attribute.
# The trailing class also excludes ")" and ":" so that punctuation directly
# after a URL is not reported as part of it.
url_re = re.compile(
    r'(((http|https|ftp|gopher|mailto):[^%s]+|(www|ftp)\.[-a-z0-9._]+)[^%s):]+)'
    r'|href="([^"]+)"' % (endChars, endChars),
    re.I)


def usage(out=None):
    """Write the command-line help text to `out` (default: sys.stdout)."""
    if out is None:
        out = sys.stdout
    out.write("USAGE: %s [-h] [-b base_url] [file|url|-] ..\n" % sys.argv[0])
    out.write(" extracts URLS from one or more files and dumps them to stdout.\n")
    out.write("Options:\n"
              " -b: specify base URL for relative-to-absolute link conversion.\n"
              " -h, -?: show this help and exit.\n")


def findUrls(text, base=""):
    """Return all URLs found in `text`, in order of appearance.

    Bare URLs are returned exactly as matched; href targets are joined
    with `base` via urljoin, which leaves already-absolute targets alone.
    """
    urls = []
    for ma in url_re.finditer(text):
        if ma.group(1):
            urls.append(ma.group(1))
        else:
            urls.append(urlparse.urljoin(base, ma.group(5)))
    return urls


def scanFile(f, base=""):
    """Read everything from the file-like object `f` and print its URLs.

    `f` only needs a read() method.  `base` is the URL that relative href
    targets are resolved against.  One URL is written to sys.stdout per
    line.
    """
    data = f.read()
    # On Python 3, binary streams (urlopen responses, files opened "rb")
    # yield bytes, which a str pattern cannot search.  On Python 2 bytes is
    # str, so this branch is never taken there.
    if isinstance(data, bytes) and not isinstance(data, str):
        data = data.decode("utf-8", "replace")
    for url in findUrls(data, base):
        sys.stdout.write(url + "\n")


def main(argv=None):
    """Run the command-line tool and return the process exit status.

    `argv` is the argument list without the program name and defaults to
    sys.argv[1:].  Returns 0 on success (including after printing help)
    and 2 when the options cannot be parsed.
    """
    if argv is None:
        argv = sys.argv[1:]

    try:
        # "h" and "?" must be declared here, otherwise getopt rejects them
        # before the loop below ever sees them.
        options, files = getopt.getopt(argv, "b:h?", ["help"])
    except getopt.GetoptError as err:
        sys.stderr.write("%s\n" % err)
        usage(sys.stderr)
        return 2

    base = ""
    for option, value in options:
        if option in ("-h", "-?", "--help"):
            usage()
            return 0
        elif option == "-b":
            base = value

    if not files:
        files = ["-"]

    for filename in files:
        if filename == "-":
            scanFile(sys.stdin, base)
            continue

        ma = url_re.match(filename)
        if ma and ma.group(3):
            # Explicit scheme: fetch it, and resolve its links against it.
            stream = urlopen(filename)
            fileBase = filename
        else:
            # Plain path.  Names that merely look like a URL without a
            # scheme (e.g. "www.example.com/index.html") are local files
            # too -- Python 2's urlopen treated them that way -- and keep
            # serving as the base for their own relative links.
            stream = open(filename, "rb")
            fileBase = filename if ma else base
        try:
            scanFile(stream, fileBase)
        finally:
            stream.close()

    return 0


if __name__ == "__main__":
    sys.exit(main())