| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| #!/usr/bin/env python |
| #----------------------------------------------------------------------------- |
| # Name: filelist.py |
| # Purpose: File listing class/functions. |
| # |
| # Author: Brian Wilson |
| # |
| # Created: Mon Apr 10 11:01:06 2006 |
| #----------------------------------------------------------------------------- |
| # |
# Usage text printed by main() via die(USAGE).
# BUG FIX: corrected '--fetchWitSubDirs' -> '--fetchWithSubDirs',
# restored the missing '[' before '--quiet]', and the missing '(' in
# 'wildcard (glob)'.
USAGE = """
filelist.py [--help] [--bottomUp] [--directory] [--delete]
            [--fetchDir <outputDir>] [--fetchWithSubDirs]
            [--list] [--matchUrl] [--quiet] [--regex '.*\.[cC]']
            [--size] [--topOnly] [--url]
            [--wildcard '*.txt.*'] [--xml] <topPaths ...>

Recursively traverse and print (with full paths or URL's) all files
under the topPath(s) that match ANY of one or more regular expressions
and/or wildcard (glob) strings. By default, it simply prints the matches,
but one can also get their sizes, fetch them, or delete them.

The topPaths can be a mixture of local and remote (ftp or http)
paths, in which case a list of URL's is returned.  If xml mode is
turned on, then the output is an XML list.

If no regex or wildcard patterns are specified, then ALL files
are returned.  If files are fetched, then the URL's are
REWRITTEN to point to the local copies.

"""
| # See the bottom of the file for exact switches and example of use. |
| |
| import sys, os, re, string, getopt, types, getpass |
| import urllib, urllib2, urlparse, time, shutil, socket, stat |
| from fnmatch import fnmatchcase |
| from ftplib import FTP |
| #import dataenc |
| |
def matchAnyThenConstrain(root, name, haveRegs, regs, haveWilds, wildCards,
                          constraintFunction):
    """Decide whether a file name passes the matching criteria.

    Returns a (matched, name) pair.  The name matches when it satisfies ANY
    of the compiled regular expressions or ANY of the wildcard (glob) specs,
    and (when given) the constraintFunction also accepts it.  A regex entry
    is a (pattern, substitution) pair; a non-None substitution rewrites the
    name, as may the constraintFunction.
    """
    # No patterns at all: everything matches, subject only to the constraint.
    if not (haveRegs or haveWilds):
        if constraintFunction is None:
            return (True, name)
        return constraintFunction(root, name)

    matched = False
    if haveRegs:
        for pattern, subst in regs:
            if pattern.search(name):
                matched = True
                if subst:
                    # optional s/match/subst/ rewrite of the name
                    name = pattern.sub(subst, name)
                break
    if not matched and haveWilds:
        for spec in wildCards:
            if fnmatchcase(name, spec):
                matched = True
                break
    # Constraint only gets a say once a pattern has matched.
    if matched and constraintFunction is not None:
        matched, name = constraintFunction(root, name)
    return (matched, name)
| |
| |
| # Users call this function |
def filelist(urlPaths, regSpecs=[], wildCards=[], needCredentials=False, userCredentials=None,
             matchFunction=matchAnyThenConstrain, constraintFunction=None,
             matchUrl=False, walkDirectories=True,
             urlMode=True, xmlMode=True, quietMode=False, verboseMode=False, getFileInfo=False,
             fetchDir=None, fetchIfNewer=False, fetchWithSubDirs=False,
             directoryMode=False, listMode=False, deleteMode=False, topDown=True,
             stream=sys.stdout):
    """Recursively traverse and print (with full paths or URL's) all files
    under the topPath(s) that match one or more regular expressions and/or
    wildcard (glob) strings, and an optional constraint (T/F) function to
    further winnow the candidate matches.  (The matchFunction can also be
    entirely replaced with custom logic.)

    By default, it simply generates the matches, but one can also fetch them,
    get their sizes, or delete them (if they are local files).
    Handles local directory paths and ftp/http URL's.

    Returns three file lists: matched, actually fetched, & destination names.
    """
    try:
        matchedFiles = []          # source files that match criteria
        fetchedFiles = []          # files that were actually fetched this run
        destinationFiles = []      # destination (local) file names (rewritten URL)

        # Normalize the top paths: absolutize local ones, strip trailing slash.
        topPaths = []
        for url in urlPaths:
            if url == '' or url == None: continue
            remote, protocol, netloc, path = remoteUrl(url)
            if not remote: url = os.path.abspath(url)
            if url[-1] == '/': url = url[:-1]
            topPaths.append(url)

        if needCredentials and userCredentials is None:
            userCredentials = promptForCredentials(topPaths)

        if fetchDir:
            workDir = os.path.join(fetchDir, '.tmp')
            # fetch into tmp directory & then rename so fetching is atomic
            try: os.mkdir(workDir)
            except OSError: pass   # may already exist; writability checked below
            if not os.path.exists(workDir):
                die("filelist: Cannot write to fetch directory %s" % fetchDir)

        if isinstance(topPaths, types.StringType): topPaths = [topPaths]
        regSpecs = [s for s in regSpecs if s != '' and s != None]
        wildCards = [s for s in wildCards if s != '' and s != None]

        # Pre-compile regex specs; each may carry an s/match/subst/ rewrite.
        haveRegs = False; regs = []; haveWilds = False; haveMatchFunction = False
        if len(regSpecs) > 0:
            haveRegs = True
            regs = []
            for reg in regSpecs:
                (pattern, subst) = parse_re_with_subst(reg)
                regs.append( (re.compile(pattern), subst) )
        if len(wildCards) > 0:
            haveWilds = True

        prefix = ''
        extra = ''
        suffix = ''
        if deleteMode:
            suffix += ' deleted.'
            # NOTE(review): topPaths entries were abspath'd above, so a bare
            # '.' can no longer appear here -- confirm this guard's intent.
            if '.' in topPaths:
                die("filelist: Recursively deleting from the dot (.) path is not safe. Shame.")

        if directoryMode: listMode = False
        if listMode: getFileInfo = True
        if quietMode: stream = None
        sumSizes = 0
        if xmlMode:
            matchedFiles.append('<files>')
            fetchedFiles.append('<files>')
            _output('<files>', destinationFiles, stream)
            prefix += ' <file>'
            suffix += '</file>'

        for top in topPaths:
            if verboseMode: warn('filelist: searching', top)
            topMatchCount = 0; topFetchCount = 0

            for root, dirs, files, infos in walk(top, userCredentials, walkDirectories, topDown):
                if verboseMode: warn('filelist: found files in', root)
                remote, protocol, netloc, path = remoteUrl(root)
                if directoryMode:
                    contents = dirs
                else:
                    contents = files

                for i in range(len(contents)):
                    line = ''
                    file = contents[i]
                    try:
                        info = infos[i]
                    except IndexError:
                        # some walkers (e.g. local os.walk) supply no FileInfo list
                        info = None
                    if matchUrl:
                        name = os.path.join(root, file)
                    else:
                        name = file

                    match, newname = matchFunction(root, name, haveRegs, regs,
                                                   haveWilds, wildCards, constraintFunction)
                    if match:
                        line = ''
                        topMatchCount += 1
                        fn = os.path.join(root, file)

                        # Gather size/mtime if requested, or when needed for a
                        # local fetchIfNewer comparison.
                        if getFileInfo or (fetchIfNewer and not remote):
                            if remote:
                                if info and getFileInfo:
                                    if listMode: line = info.line
                                    extra = ' ' + str(info.size) + ' ' + str(info.modTime)
                                    sumSizes += info.size
                            else:
                                st = os.stat(fn)
                                line = ' '.join( map(str, \
                                    (st.st_mode, st.st_uid, st.st_gid, st.st_size, st.st_mtime, fn)))
                                info = FileInfo(line, st.st_size, st.st_mtime, st.st_uid, st.st_gid, st.st_mode)
                                if getFileInfo:
                                    extra = ' ' + str(info.size) + ' ' + str(info.modTime)
                                    sumSizes += info.size

                        if not remote and urlMode: fn = makeFileUrl(fn)
                        matchedFiles.append(prefix + fn + extra + suffix)

                        # The match function may have rewritten the name.
                        if matchUrl:
                            newfn = newname
                        else:
                            newfn = os.path.join(root, newname)
                        newr, newp, newloc, newpath = remoteUrl(newfn)
                        newfile = os.path.split(newpath)[1]

                        if fetchDir:
                            if fetchDir == '.': fetchDir = os.getcwd()
                            if fetchWithSubDirs:
                                # mirror the source directory tree under fetchDir
                                destDir = os.path.join(fetchDir, newpath[1:])
                            else:
                                destDir = fetchDir
                            destFile = os.path.join(destDir, newfile)
                            tmpFile = os.path.join(workDir, newfile)

                            if shouldFetch(remote, destFile, fetchIfNewer, info):
                                if not quietMode:
                                    warn('filelist: Fetching ', fn)
                                    warn('filelist: Writing ', destFile)
                                try:
                                    os.makedirs(destDir)
                                except OSError:
                                    # kludge, makedirs throws exception if any part of path exists
                                    pass
                                if remote:
                                    urllib.urlretrieve(fn, tmpFile)
                                else:
                                    shutil.copyfile(fn, tmpFile)
                                os.rename(tmpFile, destFile)  # atomic rename of file into destDir

                                topFetchCount += 1
                                fetchedFiles.append(prefix + fn + suffix)
                            if getFileInfo: line = line + ' ' + destFile

                            # now rewrite URL to point to local copy of file
                            fn = destFile
                            if not remote and urlMode: fn = makeFileUrl(fn)

                        if not listMode:
                            line = prefix + fn + extra + suffix
                        _output(line, destinationFiles, stream)
                        if deleteMode:
                            if remote:
                                die('filelist: Cannot delete remote files (yet)')
                            else:
                                # NOTE(review): fn may have been rewritten to a
                                # file:// URL above when urlMode is on -- confirm
                                # unlink receives a plain path in that mode.
                                os.unlink(fn)

            if verboseMode and fetchDir:
                warn('filelist: Matched %d files from %s' % (topMatchCount, top))
                warn('filelist: Fetched %d files from %s' % (topFetchCount, top))
        if fetchDir:
            # clean up the temporary work directory
            for f in os.listdir(workDir): os.remove(os.path.join(workDir, f))
            os.rmdir(workDir)

        if xmlMode:
            matchedFiles.append('</files>')
            fetchedFiles.append('</files>')
            # BUG FIX: the closing tag written to the stream was '<files>',
            # producing malformed XML output.
            _output('</files>', destinationFiles, stream)

        if getFileInfo:
            if xmlMode:
                line = '<totalSize>%s</totalSize>' % sumSizes
            else:
                line = '#filelist: total size %s' % sumSizes
            matchedFiles.append(line)
            _output(line, destinationFiles, stream)

    except KeyboardInterrupt:
        if fetchDir:
            for f in os.listdir(workDir): os.remove(os.path.join(workDir, f))
            os.rmdir(workDir)
        die('filelist: Keyboard Interrupt')

    return (matchedFiles, fetchedFiles, destinationFiles)
| |
| |
def shouldFetch(remote, destFile, fetchIfNewer, srcFileInfo):
    """Return True when the source file should be copied to destFile.

    A missing destination is always fetched.  An existing destination is
    never re-fetched from a remote source; for a local source it is
    re-fetched only when fetchIfNewer is set and the source's modTime
    (from srcFileInfo) is strictly newer than the destination's mtime.
    """
    if not os.path.exists(destFile):
        return True
    if remote:
        # remote source: an existing local copy is considered good enough
        return False
    if fetchIfNewer:
        return os.path.getmtime(destFile) < srcFileInfo.modTime
    return False
| |
| def _output(line, lines, stream=None): |
| """Internal function: Add line to output lines and optionally print to stream.""" |
| lines.append(line) |
| if stream: print >>stream, line |
| |
class FileInfo:
    """Holder class for those file info. elements that are consistent among local
    files (output of stat), ftp directories, http, etc.  Minimum useful fields
    are modification time and size.  Line contains usual string output of ls -l.
    """
    def __init__(self, line, size, modTime, userId=None, groupId=None, protectMode=None):
        # raw listing line (ls -l style), kept for display in list mode
        self.line = line
        self.size = size
        self.modTime = modTime
        # optional ownership/permission fields; None when the protocol
        # does not supply them
        self.userId = userId
        self.groupId = groupId
        self.protectMode = protectMode
| |
class UserCredential(object):
    """Container for user credential info. like username, password, certificate, etc.

    The 'password' attribute is a property (declared at the bottom of the
    class): assigning it -- including the assignment in __init__ -- stores an
    encrypted, timestamped form via dataenc, and reading it decrypts the value
    and enforces the validity interval.

    NOTE(review): the 'import dataenc' at the top of this file is commented
    out, so getPassword/setPassword will raise NameError for any non-empty
    password -- confirm whether dataenc is supplied elsewhere.
    """
    def __init__(self, username=None, password=None, validInterval=None, certificate=None):
        self.username = username
        # Goes through the setPassword property below, encrypting the value.
        self.password = password
        self.validInterval = validInterval # tuple of Ints (days, hours, minutes)
        # A password without an expiry interval is a fatal configuration error.
        if password is not None and validInterval is None:
            die('UserCredential: If password is present, validInterval is also required.')
        self.certificate = certificate

    def getPassword(self):
        # Decrypt the stored password; return None when unset or expired.
        pw = self._password
        if pw:
            pw, daynumber, timestamp = dataenc.pass_dec(pw)
            if dataenc.unexpired(daynumber, timestamp, self.validInterval):
                return pw
            else:
                # credential has expired
                return None
        else:
            return None
    def setPassword(self, pw):
        # Store an encrypted + timestamped form; empty/None values pass through.
        if pw and pw != '':
            self._password = dataenc.pass_enc(pw, daynumber=True, timestamp=True)
        else:
            self._password = pw
    password = property(getPassword, setPassword)
| |
class UserCredentials:
    """Contains dictionary of (url, credential) pairs and optionally an httpProxy.

    Lookup is by URL prefix: forUrl() returns the first registered credential
    whose key is a prefix of the queried URL.
    """
    def __init__(self, httpProxy=None, credentials=None):
        self.httpProxy = httpProxy
        # BUG FIX: the default was the mutable literal 'credentials={}',
        # which is shared across every instance created with the default --
        # adding a credential to one registry silently added it to all.
        self.credentials = {} if credentials is None else credentials
    def add(self, url, credential):
        """Register credential under a URL prefix; returns self for chaining."""
        self.credentials[url] = credential; return self
    def forUrl(self, url):
        """Return the first credential whose URL prefix matches, else None."""
        for key in self.credentials:
            if url.startswith(key):
                return self.credentials[key]
        return None
| |
def promptForCredentials(urls, httpProxy=None):
    """Interactively gather credentials for every remote URL in urls.

    Prompts once for an HTTP proxy (when not supplied) and then for a
    username/password per remote URL.  Returns a UserCredentials registry.
    """
    if httpProxy == None:
        answer = raw_input('Enter HTTP proxy [none]: ')
        httpProxy = answer or None   # empty reply means no proxy
    credentials = UserCredentials(httpProxy)
    localUserName = getpass.getuser()
    for url in urls:
        if remoteUrl(url)[0]:
            username, password, validInterval = promptForCredential(url, localUserName)
            credentials.add(url, UserCredential(username, password, validInterval))
    return credentials
| |
def promptForCredential(url, localUserName):
    """Prompt for a username/password for one URL.

    Returns (username, password, validInterval); validInterval defaults to
    (0 days, 1 hour, 0 minutes).  NOTE(review): a user-supplied interval is
    returned as a list of strings from split() -- confirm downstream
    consumers accept that (original behavior preserved).
    """
    protocol = remoteUrl(url)[1]
    # ftp conventionally allows anonymous login; otherwise offer the local user
    defaultUserName = 'anonymous' if protocol == 'ftp' else localUserName
    username = raw_input('Need credentials for URL %s\nUsername [%s]: '
                         % (url, defaultUserName))
    if not username:
        username = defaultUserName
    password = ''
    while not password:
        password = getpass.getpass()
    validInterval = [0, 1, 0]
    if password != '':
        response = raw_input('Enter valid time period for credential [(days, hours, minutes) = 0 1 0]: ')
        if response != '':
            validInterval = response.split()
    return (username, password, validInterval)
| |
class DirectoryWalker:
    """Recursively walk directories using the protocol specified in a URL.
    Subclasses handle ftp, http, sftp, local file system, etc.

    Subclasses must implement retrieveDirList() and parseDirList().
    """
    def __init__(self, userCredentials=None, retries=3, sleepTime=5):
        self.userCredentials = userCredentials  # UserCredentials registry or None
        self.retries = retries                  # connection attempts per listing
        self.sleepTime = sleepTime              # seconds between retries

    def walk(self, top, walkDirectories=True):
        """Recursively walk directories on a remote site to retrieve file lists.

        Generator yielding (root, dirs, files, infos) tuples, depth-first.
        When walkDirectories is False only the top directory is listed.
        """
        remote, protocol, netloc, path = remoteUrl(top)
        status, dir_listing = self.retrieveDirList(top)
        if status:
            if len(dir_listing) == 0:
                # BUG FIX: previously 'dirs' was left unbound for an empty
                # listing, raising UnboundLocalError in the recursion loop
                # below when walkDirectories was set.
                dirs, files, infos = [], [], []
            else:
                (dirs, files, infos) = self.parseDirList(dir_listing, path)
            yield (top, dirs, files, infos)

            if walkDirectories:
                for dir in dirs:
                    # Depth-first recursion
                    for root, subdirs, subfiles, subinfos in self.walk(top + '/' + dir, walkDirectories):
                        yield (root, subdirs, subfiles, subinfos)
        else:
            warn('DirectoryWalker: error, unable to retrieve directory listing at', top)
            yield (top, [], [], [])

    def retrieveDirList(self, url):
        """Retrieve directory listing as a list of text lines. Returns (status, dirList)."""
        pass
    def parseDirList(self, dirList, path=None):
        """Parse directory listing (text) and return three lists (dirs, files, fileInfos)."""
        pass
| |
class FtpDirectoryWalker(DirectoryWalker):
    """Recursively walk directories on an ftp site."""
    def __init__(self, userCredentials=None, retries=3, sleepTime=5):
        DirectoryWalker.__init__(self, userCredentials, retries, sleepTime)

    def retrieveDirList(self, url):
        """Retrieve a directory listing via ftp with retries.

        Returns (True, listing) on success, or (False, '') after all
        retries fail.  Logs in anonymously when no credential is
        registered for the url.
        """
        remote, protocol, netloc, path = remoteUrl(url)
        credential = None
        if self.userCredentials:
            credential = self.userCredentials.forUrl(url)
        dir = ''; dir_list = []
        ftp = FTP()
        for i in range(self.retries):
            try:
                ftp.connect(netloc)
                if credential is None or \
                   credential.username == 'anonymous' or \
                   credential.username == '':
                    ftp.login()
                else:
                    ftp.login(credential.username, credential.password)
                ftp.cwd(path)
                ftp.retrlines('LIST', dir_list.append)
                ftp.quit()
                dir = '\n'.join(dir_list)
                return (True, dir)
            except Exception:
                # BUG FIX: was a bare 'except:', which also swallowed
                # KeyboardInterrupt/SystemExit; retry only on real errors.
                pass
            time.sleep(self.sleepTime)
            warn('FtpDirectoryWalker: connect retry to ', netloc, path)
        return (False, dir)

    def parseDirList(self, dir, path=None):
        """Parse long directory listing returned by ftp or (ls -l).
        Separate entries into directories and files.

        Returns (dirs, files, infos).  NOTE(review): modTime in each
        FileInfo is the joined date fields (a string), not a numeric
        timestamp -- comparing it to os.path.getmtime() elsewhere would
        not be meaningful; confirm callers.
        """
        dirs = []; files = []; infos = []
        for entry in dir.split('\n'):
            fields = entry.split()
            if len(fields) < 7: continue
            fn = fields[-1]
            if fn == '.' or fn == '..': continue
            # Directory entries: mode starts with 'd' and is world-readable
            # (position 7 of an ls -l mode string is the "other read" bit).
            # BUG FIX: guard the index so short mode strings cannot raise.
            if re.match('^d', fields[0]) and len(fields[0]) > 7 and fields[0][7] == 'r':
                dirs.append(fn)
            else:
                files.append(fn)
            info = FileInfo(entry, int(fields[4]), '-'.join(fields[5:8]), \
                            fields[2], fields[3], fields[0])
            infos.append(info)
        return (dirs, files, infos)
| |
class DirListingParser(object):
    """Base class for directory listing parsers.

    Holds a regex (compiled once in __init__) that subclasses use in
    parse() to extract entries from a directory-listing HTML page.
    """
    def __init__(self, regex):
        self.regex = regex
        self.compiledRegex = re.compile(self.regex)

    def parse(self, dir, listingHtml):
        """Return (dirs, files, infos).  Must be overridden by subclasses."""
        # BUG FIX: modernized the py2-only 'raise E, msg' statement form and
        # removed the unreachable local-list initializations before it.
        raise NotImplementedError("Override this method in sub class.")
| |
class ApacheDirListingParser(DirListingParser):
    """Parser class for apache."""
    def parse(self, dir, listingHtml):
        """Extract (dirs, files, infos) from an Apache fancy-index page.

        Directory names lose their trailing slash; file info is not
        computed for HTTP listings, so infos holds a None per entry.
        """
        dirs = []; files = []; infos = []
        for item, itemName in self.compiledRegex.findall(listingHtml):
            # skip the navigation link back to the parent
            if itemName.strip() == 'Parent Directory': continue
            if isinstance(item, str):
                name = item
            else:
                # regexes with extra stat groups yield larger tuples
                name, dateTime, size = item[:]

            if name.endswith('/'):
                dirs.append(name[:-1])
            else:
                files.append(name)
            # size/date parsing is intentionally disabled for this parser
            infos.append(None)
        return (dirs, files, infos)
| |
class CDAACDirListingParser(DirListingParser):
    """Parser class for CDAAC data server."""
    def parse(self, dir, listingHtml):
        """Extract (dirs, files, infos) from a CDAAC-style listing page.

        Unlike the Apache parser, directory names keep their trailing
        slash.  File info is not computed; infos holds a None per entry.
        """
        dirs = []; files = []; infos = []
        for item, itemName in self.compiledRegex.findall(listingHtml):
            # skip the navigation link back to the parent
            if itemName.strip() == 'Parent Directory': continue
            if isinstance(item, str):
                name = item
            else:
                # regexes with extra stat groups yield larger tuples
                name, dateTime, size = item[:]
            if name.endswith('/'):
                dirs.append(name)
            else:
                files.append(name)
            # size/date parsing is intentionally disabled for this parser
            infos.append(None)
        return (dirs, files, infos)
| |
class HttpDirectoryWalker(DirectoryWalker):
    """Recursively walk directories on an http (web) site to retrieve file lists.
    Handles many styles of HTML directory listings, but still very FRAGILE.
    """

    # list of directory listing parser plugins, tried in order when the
    # inline regexes in parseDirList() find nothing
    DIR_LIST_REGEX_PLUGINS = [
        # apache 2.0.55 directory listing
        ApacheDirListingParser(r'(?i)alt="\[.*?\]">\s*<A HREF="(?P<name>.*?)">(.*?)</A>'),
        # CDAAC (COSMIC Data)
        CDAACDirListingParser(r'(?i)<LI><A HREF="(?P<name>.*?)">(.*?)</A>'),
        ]

    def __init__(self, userCredentials=None, retries=3, sleepTime=5):
        DirectoryWalker.__init__(self, userCredentials, retries, sleepTime)
        # When credentials are supplied, build a urllib2 opener with basic
        # auth (and optionally an http proxy); otherwise retrieveDirList
        # falls back to plain urllib.urlopen.
        if self.userCredentials:
            if self.userCredentials.httpProxy:
                os.environ['http_proxy'] = self.userCredentials.httpProxy
                # global kludge, default proxyHandler looks up proxy there
            passwordMgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
            for url, cred in self.userCredentials.credentials.iteritems():
                passwordMgr.add_password(None, url, cred.username, cred.password)
            authHandler = urllib2.HTTPBasicAuthHandler(passwordMgr)
            opener = urllib2.build_opener(authHandler)
        else:
#            opener = urllib2.build_opener()
            opener = None
#        opener.add_headers = [('User-agent', 'Mozilla/5.0')]
        self.opener = opener

    def retrieveDirList(self, url):
        """Retrieve an HTML directory listing via http with retries.

        Returns (True, html) on success, or (False, '') after all retries
        fail.  Uses the authenticated opener when one was configured.
        """
        ### url = os.path.join(url, 'contents.html')  ### hack for DAP servers at GES-DISC
        dir_listing = ''
        proxies = {}  # NOTE(review): unused local -- presumably leftover
        for i in range(self.retries):
            try:
                if self.opener:
                    response = self.opener.open(url)
                else:
                    response = urllib.urlopen(url)
            except IOError, e:
                if hasattr(e, 'reason'):
                    warn('HttpDirectoryWalker: Error, failed to reach server because: %s' % e.reason)
                elif hasattr(e, 'code'):
                    warn('HttpDirectoryWalker: Server could not fulfill request, error code %s' % e.code)
            else:
                # success: read the page and stop retrying
                dir_listing = response.read()
                return (True, dir_listing)
            time.sleep(self.sleepTime)
            warn('HttpDirectoryWalker: retrying ', url)
        return (False, dir_listing)

    # extracts the directory path from an Apache "<H1>Index of ...</H1>" header
    reDirPath = re.compile(r'(?i)<H1>.*?Index of\s*?(\S+?)\s*?</H1>')

    def parseDirList(self, dir, path):
        """Parse fragile HTML directory listings returned by various HTTP servers,
        including Apache and OpenDAP. Separate entries into directories and files.

        Returns (dirs, files, infos).  Falls back to the plugin parsers in
        DIR_LIST_REGEX_PLUGINS when the inline regexes match nothing.

        NOTE(review): 'dirName' is only assigned inside 'if path:' but is used
        unconditionally below -- an empty path would raise NameError; confirm
        callers always pass a non-empty path.
        """
        dirs = []; files = []; infos = []
        if path:
            # sanity-check that the listing is for the directory we asked for
            match = HttpDirectoryWalker.reDirPath.search(dir)
            if not match:
                die('HttpDirectoryWalker: Cannot find directory name %s in HTML listing:\n%s' % (path, dir))
            dirName = match.group(1)
            if dirName not in path:
                warn('HttpDirectoryWalker: Directory name %s in HTML listing does not agree with path %s:\n%s' % (dirName, path, dir))

        # Try to find directory lines that contain file info
        reDirListWithStat = re.compile( \
            r'(?i)<A HREF=[\'"]*?(?P<name>[^\?].*?' + dirName + r'.*?)[\'"]*?>.*?</A>\s*(?P<dateTime>\S+ \S+)\s+?(?P<size>\S+)\s*?$')
        items = reDirListWithStat.findall(dir)
        # If not, then try to find simple directory lines
        if len(items) == 0:
            reDirList = re.compile( \
                r'(?i)<A HREF=[\'"]*?(?P<name>[^\?].*?' + dirName + r'.*?)[\'"]*?>.*?</A>')
            items = reDirList.findall(dir)

        if len(items) != 0:
            dateTime = '? ?'; size = ''
            for item in items:
                if isinstance(item, str):
                    # simple regex: the match is the name itself
                    name = item
                else:
                    # stat regex: (name, dateTime, size) groups
                    name, dateTime, size = item[:]
                if dirName not in name: continue

                if name.endswith('/'):
                    type = 'd'
                    dirs.append(name)
                else:
                    type = '-'
                    files.append(name)
                # decode human-readable sizes ('14k', '3m'); -1 means unknown.
                # NOTE(review): for string items, 'size' may still hold the
                # int from a previous iteration when .lower() is called --
                # confirm this path is exercised only with tuple items.
                size = size.lower()
                if size.endswith('k'):
                    size = int(size[:-1]) * 1024
                elif size.endswith('m'):
                    size = int(size[:-1]) * 1024 * 1024
                else:
                    size = -1
                line = '%s--------- 1 ? ? %15d %s %s' % (type, size, dateTime, name)
                info = FileInfo(line, size, dateTime)
                infos.append(info)
                print line  # NOTE(review): looks like leftover debug output -- confirm

        #try plugins
        else:
            for plugin in self.DIR_LIST_REGEX_PLUGINS:
                pluginResults = plugin.parse(dirName, dir)
                # first plugin that finds anything wins
                if len(pluginResults[0]) != 0 or len(pluginResults[1]) != 0 or \
                   len(pluginResults[2]) != 0: return pluginResults

        return (dirs, files, infos)
| |
| |
def walk(top, userCredentials=None, walkDirectories=True, topDown=True):
    """Recursively walk directories to retrieve file lists.
    Returns the topPath, contained subdirectories and files, and
    optionally FileInfo objects (if info is included in protocol results).
    Handles local directory paths and ftp/http protocols (URL's).
    """
    remote, protocol, netloc, path = remoteUrl(top)
    if not remote:
        # plain local path: delegate to os.walk / os.listdir
        if walkDirectories:
            for root, dirs, files in os.walk(top, topDown):
                yield (root, dirs, files, [])
        else:
            yield (top, [], os.listdir(top), [])
        return

    if protocol == 'ftp':
        walker = FtpDirectoryWalker(userCredentials)
    elif protocol == 'http':
        walker = HttpDirectoryWalker(userCredentials)
    elif protocol == 'sftp':
        # NOTE(review): SftpDirectoryWalker is not defined in this file --
        # this branch raises NameError unless it is provided elsewhere.
        walker = SftpDirectoryWalker(userCredentials)
    else:
        die('filelist: Cannot handle protocol ', protocol)
        return
    for root, dirs, files, infos in walker.walk(top, walkDirectories):
        yield (root, dirs, files, infos)
| |
def remoteUrl(url):
    """Returns True if the URL is remote; also returns protocol,
    net location (host:port), and path, as (remote, protocol, netloc, path).

    A bare filesystem path (no protocol) yields (False, '', '', path).
    """
    # Py2/py3 compatibility: the module was renamed urlparse -> urllib.parse;
    # resolve it locally so this helper works under either interpreter.
    try:
        from urlparse import urlparse as _urlparse
    except ImportError:
        from urllib.parse import urlparse as _urlparse
    protocol, netloc, path, params, query, fragment = _urlparse(url)
    if protocol == '':
        return (False, protocol, netloc, path)
    else:
        return (True, protocol, netloc, path)
| |
| |
# utils
RE_WITH_SUBST_PATTERN = re.compile(r'^s/(.+)/(.+)/$')

def parse_re_with_subst(str):
    """Split an 's/match/replacement/' spec into (match, replacement).

    A plain regex (no s/../../ wrapper) comes back as (spec, None).
    """
    m = RE_WITH_SUBST_PATTERN.match(str)
    if m is None:
        return (str, None)
    return (m.group(1), m.group(2))
| |
def hostName():
    """Return the fully-qualified domain name of the local host
    (reverse DNS lookup on the local host name)."""
    local = socket.gethostname()
    return socket.gethostbyaddr(local)[0]
| |
# Host-qualified file:// prefix, resolved once at import time.
FILE_URL_PREFIX = 'file://' + hostName()

def makeFileUrl(file):
    """Turn a local absolute path into a file:// URL on this host."""
    return '%s%s' % (FILE_URL_PREFIX, file)
| |
def warn(*args):
    """Write the space-joined arguments to stderr, newline-terminated.

    BUG FIX: arguments are now converted with str() first -- the original
    ' '.join(...) raised TypeError on non-string arguments -- and the
    vararg no longer shadows the 'str' builtin.
    """
    sys.stderr.write(' '.join(str(a) for a in args) + '\n')

def die(str, status=1):
    """Print the message to stderr and exit with the given status code."""
    warn(str); sys.exit(status)
| |
| def main(): |
| """Main function for outside scripts to call.""" |
| |
| from sys import argv |
| |
| if len(argv) < 2: die(USAGE) |
| try: |
| opts, argv = getopt.getopt(argv[1:], 'hbcdf:ilqr:stuvw:x', |
| ['help', 'bottomUp', 'credentials', 'delete', 'directory', |
| 'fetchDir=', 'fetchIfNewer', 'fetchWithSubDirs', 'info', |
| 'list', 'quiet', 'regex=', 'size', 'topOnly', |
| 'url', 'verbose', 'wildcard=', 'xml']) |
| except getopt.GetoptError, (msg, bad_opt): |
| die("%s error: Bad option: %s, %s" % (argv[0], bad_opt, msg)) |
| |
| regSpecs = []; wildCards = []; matchUrl=False; walkDirectories = True |
| needCredentials = False; userCredentials = None |
| urlMode=False; xmlMode=False; quietMode=False; verboseMode=False; getFileInfo=False |
| fetchDir = None; fetchIfNewer=False; fetchWithSubDirs=False |
| directoryMode = False; deleteMode = False; topDown = True; listMode = False |
| |
| for opt, val in opts: |
| if opt in ('-h', '--help'): die(USAGE) |
| elif opt in ('-b', '--bottomUp'): topDown = False |
| elif opt in ('-c', '--credentials'): needCredentials = True |
| elif opt in ('-d', '--directory'): directoryMode=True |
| elif opt in ('--delete'): deleteMode=True |
| elif opt in ('-f', '--fetchDir'): fetchDir = val |
| # retrieve remote files to this dir |
| elif opt in ('--fetchIfNewer'): fetchIfNewer=True |
| # only fetch if src file is newer than existing dest file |
| elif opt in ('--fetchWithSubDirs'): fetchWithSubDirs=True |
| # mirror subdirectories when fetching |
| elif opt in ('-i', '--info'): getFileInfo=True |
| elif opt in ('-l', '--list'): listMode=True |
| elif opt in ('-m', '--matchUrl'): matchUrl=True |
| # regexs match entire URL/path, not just file name |
| elif opt in ('-q', '--quiet'): quietMode=True |
| # don't print files during walk |
| elif opt in ('-r', '--regex'): regSpecs.append(val) |
| elif opt in ('-s', '--size'): sizeMode=True |
| elif opt in ('-t', '--topOnly'): walkDirectories=False |
| elif opt in ('-u', '--url'): urlMode=True |
| # return URL's (file:, ftp:, http:, etc.) |
| elif opt in ('-v', '--verbose'): verboseMode=True |
| elif opt in ('-w', '--wildcard'): wildCards.append(val) |
| elif opt in ('-x', '--xml'): xmlMode=True # return list in XML format |
| else: die(USAGE) |
| |
| # import pdb; pdb.set_trace() |
| |
| matchedFiles, fetchedFiles, destinationFiles = \ |
| filelist(argv, regSpecs, wildCards, needCredentials, userCredentials, |
| matchAnyThenConstrain, None, matchUrl, walkDirectories, |
| urlMode, xmlMode, quietMode, verboseMode, getFileInfo, |
| fetchDir, fetchIfNewer, fetchWithSubDirs, |
| directoryMode, listMode, deleteMode, topDown) |
| |
| if quietMode: |
| if listMode == 'match': |
| print matchedFiles |
| elif listMode == 'fetch': |
| print fetchedFiles |
| elif listMode == 'destination': |
| print destinationFiles |
| else: |
| pass |
| |
| |
| if __name__ == '__main__': main() |