lswww.py

#!/usr/bin/env python

# lswww v2.1.5 - A web spider library
# Copyright (C) 2006 Nicolas Surribas
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

import sys, re, socket, getopt, os
import HTMLParser, urllib, urllib2

try:
      import cookielib
except ImportError:
      cookielibhere=0
else:
      cookielibhere=1

try:
      import tidy
except ImportError:
      print "lswww will be far less effective without tidy"
      print "please install libtidy ( http://tidy.sourceforge.net/ ),"
      print "ctypes ( http://starship.python.net/crew/theller/ctypes/ )"
      print "and uTidylib ( http://utidylib.berlios.de/ )"
      tidyhere=0
else:
      tidyhere=1

try:
      import BeautifulSoup
except ImportError:
      BeautifulSouphere=0
else:
      BeautifulSouphere=1

class lswww:
      """
      lswww explores a website and extracts links and form fields.

Usage: python lswww.py http://server.com/base/url/ [options]

Supported options are:
-s <url>
--start <url>
      To specify a URL to start with

-x <url>
--exclude <url>
      To exclude a URL from the scan (for example logout scripts)
      You can also use a wildcard (*)
      Example : -x "http://server/base/?page=*&module=test"
      or -x http://server/base/admin/* to exclude a directory

-p <url_proxy>
--proxy <url_proxy>
      To specify a proxy
      Example: -p http://proxy:port/

-c <cookie_file>
--cookie <cookie_file>
      To use a cookie

-a <login%password>
--auth <login%password>
      Set credentials for HTTP authentication
      Doesn't work with Python 2.4

-r <parameter_name>
--remove <parameter_name>
      Remove a parameter from URLs

-v <level>
--verbose <level>
      Set verbosity level
      0: only print results
      1: print a dot for each url found (default)
      2: print each url

-t <timeout>
--timeout <timeout>
      Set the timeout (in seconds)

-h
--help
      To print this usage message
      """

      root=""
      server=""
      tobrowse=[]
      browsed=[]
      proxy={}
      excluded=[]
      forms=[]
      uploads=[]
      allowed=['php','html','htm','xml','xhtml','xht','xhtm',
               'asp','aspx','php3','php4','php5','txt','shtm',
               'shtml','phtm','phtml','jhtml','pl','jsp','cfm','cfml']
      verbose=0
      cookie=""
      auth_basic=[]
      bad_params=[]
      timeout=6

      def __init__(self,rooturl):
            root=rooturl
            if root[-1]!="/":
                  root+="/"
            if(self.checklink(root)):
                  print "Invalid link argument"
                  sys.exit(0)

            server=(root.split("://")[1]).split("/")[0]
            self.root=root
            self.server=server

            self.tobrowse.append(root)
      
      def setTimeOut(self,timeout=6):
            """Set the timeout in seconds to wait for a page"""
            self.timeout=timeout

      def setProxy(self,proxy={}):
            """Set proxy preferences"""
            self.proxy=proxy
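            # Example (a sketch; the proxy address is hypothetical), matching
            # the {'http': url} dict built by the command-line handler below:
            #     myls.setProxy({'http':'http://proxy:8080/'})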

      def addStartURL(self,url):
            if(self.checklink(url)):
                  print "Invalid link argument:",url
                  sys.exit(0)
            if(self.inzone(url)==0):
                  self.tobrowse.append(url)

      def addExcludedURL(self,url):
            """Add a URL to the list of forbidden URLs"""
            self.excluded.append(url)

      def setCookieFile(self,cookie):
            """Set the file to read the cookie from"""
            self.cookie=cookie

      def setAuthCredentials(self,auth_basic):
            self.auth_basic=auth_basic
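            # Example (a sketch): credentials as a [login, password] pair,
            # as built from the -a login%password command-line option below:
            #     myls.setAuthCredentials(["login","password"])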

      def addBadParam(self,bad_param):
            self.bad_params.append(bad_param)

      def browse(self,url):
            """Extract URLs from a webpage and add them to the list of URLs to browse if they aren't in the exclusion list"""
            # We don't need destination anchors
            current=url.split("#")[0]
            # Url without query string
            current=current.split("?")[0]
            # Get the dirname of the file
            currentdir="/".join(current.split("/")[:-1])+"/"

            # The timeout must not be too long, so we don't block on big documents
            # (for example a download script), and not too short, to still give good results
            socket.setdefaulttimeout(self.timeout)
            try:
                  req = urllib2.Request(url)
                  u = urllib2.urlopen(req)
            # BadStatusLine can happen when no HTTP status code is given or when a connection is suddenly closed
            except urllib2.httplib.BadStatusLine:
                  print "Error reading response"
                  return 0
            except IOError,e:
                  print "\n"+url+":",e
                  self.excluded.append(url)
                  return 0
            proto=url.split("://")[0]
            if proto=="http" or proto=="https":
                  # Check the content-type first
                  if not u.info().get("Content-Type"):
                        # Sometimes there's no content-type... so we rely on the document extension
                        if (current.split(".")[-1] not in self.allowed) and current[-1]!="/":
                              return 1
                  elif u.info().get("Content-Type").find("text")==-1:
                        return 1
            # Manage redirections
            if u.headers.dict.has_key("location"):
                  redir=self.correctlink(u.headers.dict["location"],current,currentdir,proto)
                  if redir!=None:
                        if(self.inzone(redir)==0):
                              # Is the document already visited or forbidden?
                              if (redir in self.browsed) or (redir in self.tobrowse) or self.isExcluded(redir):
                                    pass
                              else:
                                    # No -> Will browse it soon
                                    self.tobrowse.append(redir)
            try:
                  htmlSource=u.read()
            except socket.timeout:
                  htmlSource=""
            p=linkParser()
            try:
                  p.feed(htmlSource)
            except HTMLParser.HTMLParseError,err:
                  if tidyhere==1:
                        options = dict(output_xhtml=1, add_xml_decl=1, indent=1, tidy_mark=0)
                        htmlSource=str(tidy.parseString(htmlSource,**options))
                        try:
                              p.reset()
                              p.feed(htmlSource)
                        except HTMLParser.HTMLParseError,err:
                              pass
                  elif BeautifulSouphere==1:
                        htmlSource=BeautifulSoup.BeautifulSoup(htmlSource).prettify()
                        try:
                              p.reset()
                              p.feed(htmlSource)
                        except HTMLParser.HTMLParseError,err:
                              pass
                  # last chance
                  else:
                        p.liens=re.findall('href="(.*?)"',htmlSource)

            for lien in p.uploads:
                  self.uploads.append(self.correctlink(lien,current,currentdir,proto))
            for lien in p.liens:
                  lien=self.correctlink(lien,current,currentdir,proto)
                  if lien!=None:
                        if(self.inzone(lien)==0):
                              # Is the document already visited or forbidden?
                              if (lien in self.browsed) or (lien in self.tobrowse) or self.isExcluded(lien):
                                    pass
                              else:
                                    # No -> Will browse it soon
                                    self.tobrowse.append(lien)
            for form in p.forms:
                  action=self.correctlink(form[0],current,currentdir,proto)
                  if action==None: action=current
                  form=(action,form[1],url)
                  if form not in self.forms: self.forms.append(form)
            # We automatically exclude 404 URLs
            if u.code==404:
                  self.excluded.append(url)
                  return 0
            return 1
      

      def correctlink(self,lien,current,currentdir,proto):
            """Transform relative URLs into absolute ones"""
            # No leading or trailing whitespaces
            lien=lien.strip()
            # bad protocols
            if lien.find("telnet:",0)==0 or lien.find("ftp:",0)==0 or lien.find("mailto:",0)==0 or \
            lien.find("javascript:",0)==0 or lien.find("news:",0)==0 or lien.find("file:",0)==0 or \
            lien.find("gopher:",0)==0 or lien.find("irc:",0)==0 or lien=="":
                  return None
            # Good protocols or relatives links
            else:
                  # full url, nothing to do :)
                  if (lien.find("http://",0)==0) or (lien.find("https://",0)==0):
                        pass
                  else:
                        # root-url related link
                        if(lien[0]=='/'):
                              lien=proto+"://"+self.server+lien
                        else:
                              # same page + query string
                              if(lien[0]=='?'):
                                    lien=current+lien
                              # current directory related link
                              else:
                                    lien=currentdir+lien
                  # No destination anchor
                  if lien.find("#")!=-1:
                        lien=lien.split("#")[0]
                  # reorganize parameters in alphabetical order
                  if lien.find("?") != -1:
                        args=lien.split("?")[1]
                        if args.find("&") != -1 :
                              args=args.split("&")
                              args.sort()
                              args=[i for i in args if i!="" and i.find("=")>=0]
                              for i in self.bad_params:
                                    for j in args:
                                          if j.startswith(i+"="): args.remove(j)
                              args="&".join(args)

                        # a hack for auto-generated Apache directory index
                        if args in ["C=D;O=A","C=D;O=D","C=M;O=A","C=M;O=D","C=N;O=A","C=N;O=D","C=S;O=A","C=S;O=D"]:
                              lien=lien.split("?")[0]
                        else:
                              lien=lien.split("?")[0]+"?"+args
                  # Remove the trailing '?' if its presence doesn't make sense
                  if lien[-1:]=="?":
                        lien=lien[:-1]
                  # remove useless slashes
                  if lien.find("?")!=-1:
                        file=lien.split("?")[0]
                        file=re.sub("[^:]//+","/",file)
                        lien=file+"?"+lien.split("?")[1]
                  # links going to a parent directory (..)
                  while re.search("/([~:!,;a-zA-Z0-9\.\-+_]+)/\.\./",lien)!=None:
                        lien=re.sub("/([~:!,;a-zA-Z0-9\.\-+_]+)/\.\./","/",lien)
                  lien=re.sub("/\./","/",lien)
                  # Everything is good here
                  return lien
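
      # A few examples of what correctlink does (hypothetical inputs, assuming
      # self.server=="server", current=="http://server/dir/page.php" and
      # currentdir=="http://server/dir/"):
      #     correctlink("/admin/",current,currentdir,"http")    -> "http://server/admin/"
      #     correctlink("?b=2&a=1",current,currentdir,"http")   -> "http://server/dir/page.php?a=1&b=2"  (parameters sorted)
      #     correctlink("mailto:x@y",current,currentdir,"http") -> None  (unsupported protocol)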

      def checklink(self,url):
            """Verify the protocol"""
            if (url.find("http://",0)==0) or (url.find("https://",0)==0):
                  return 0
            else:
                  return 1

      def inzone(self,url):
            """Make sure the URL is under the root URL"""
            if(url.find(self.root,0)==0):
                  return 0
            else:
                  return 1
      
      def isExcluded(self,url):
            """Return True if the URL is not allowed to be scanned"""
            match=False
            for regexp in self.excluded:
                  if self.reWildcard(regexp,url):
                        match=True
            return match
              
      def reWildcard(self,regexp,string):
            """Wildcard-based regular expression system"""
            regexp=re.sub("\*+","*",regexp)
            match=True
            if regexp.count("*")==0:
                  if regexp==string:
                        return True
                  else:
                        return False
            blocks=regexp.split("*")
            start=""
            end=""
            if not regexp.startswith("*"):
                  start=blocks[0]
            if not regexp.endswith("*"):
                  end=blocks[-1]
            if start!="":
                  if string.startswith(start):
                        blocks=blocks[1:]
                  else:
                        return False
            if end!="":
                  if string.endswith(end):
                        blocks=blocks[:-1]
                  else:
                        return False
            blocks=[block for block in blocks if block!=""]
            if blocks==[]:
                  return match
            for block in blocks:
                  i=string.find(block)
                  if i==-1: return False
                  string=string[i+len(block):]
            return match
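
      # Examples of the wildcard matching (hypothetical URLs):
      #     reWildcard("http://server/base/admin/*","http://server/base/admin/del.php")              -> True
      #     reWildcard("*logout*","http://server/app/logout.php")                                    -> True
      #     reWildcard("http://server/base/*&module=test","http://server/base/?page=1&module=test")  -> True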

      def go(self):
            director = urllib2.OpenerDirector()
            
            director.add_handler(urllib2.HTTPHandler())
            director.add_handler(urllib2.HTTPSHandler())

            if self.proxy!={}:
                  director.add_handler(urllib2.ProxyHandler(self.proxy))

            if self.auth_basic!=[]:
                  auth=urllib2.HTTPBasicAuthHandler(urllib2.HTTPPasswordMgrWithDefaultRealm())
                  auth.add_password(None, self.server, self.auth_basic[0], self.auth_basic[1])
                  director.add_handler(auth)

            if self.cookie!="" and cookielibhere==1:
                  cj = cookielib.LWPCookieJar()
                  if os.path.isfile(self.cookie):
                        cj.load(self.cookie,ignore_discard=True)
                        director.add_handler(urllib2.HTTPCookieProcessor(cj))

            urllib2.install_opener(director)
            # While the url list isn't empty, continue browsing.
            # If the user stops the scan with Ctrl+C, still report the urls found so far.
            try:
                  while len(self.tobrowse)>0:
                        lien=self.tobrowse.pop(0)
                        if (lien not in self.browsed):
                              if self.browse(lien):
                                    self.browsed.append(lien)
                              if self.verbose==1:
                                    sys.stderr.write('.')
                              elif self.verbose==2:
                                    sys.stderr.write(lien+"\n")
            except KeyboardInterrupt:
                  pass

      def verbosity(self,vb):
            """Set verbosity level"""
            self.verbose=vb

      def printLinks(self):
            """Print found URLs on standard output"""
            self.browsed.sort()
            sys.stderr.write("\n+ URLs :\n")
            for lien in self.browsed:
                  print lien

      def printForms(self):
            """Print found forms on standard output"""
            if self.forms!=[]:
                  sys.stderr.write("\n+ Forms Info :\n")
                  for form in self.forms:
                        print "From:",form[2]
                        print "To:",form[0]
                        for k,v in form[1].items():
                              print "\t"+k,":",v
                        print

      def printUploads(self):
            """Print URLs accepting uploads"""
            if self.uploads!=[]:
                  sys.stderr.write("\n+ Upload Scripts :\n")
                  for up in self.uploads:
                        print up

      def getLinks(self):
            self.browsed.sort()
            return self.browsed

      def getForms(self):
            return self.forms

      def getUploads(self):
            self.uploads.sort()
            return self.uploads
      
class linkParser(HTMLParser.HTMLParser):
      """Extract URLs from 'a' href HTML tags"""
      def __init__(self):
            HTMLParser.HTMLParser.__init__(self)
            self.liens=[]
            self.forms=[]
            self.form_values={}
            self.inform=0
            self.current_form_url=""
            self.uploads=[]
            self.current_form_method="get"

      def handle_starttag(self,tag,attrs):
            tmpdict={}
            val=None
            for k,v in dict(attrs).items():
                  tmpdict[k.lower()]=v
            if tag.lower()=='a':
                  if "href" in tmpdict.keys():
                        self.liens.append(tmpdict['href'])
      
            if tag.lower()=='form':
                  self.inform=1
                  self.form_values={}
                  if "action" in tmpdict.keys():
                        self.liens.append(tmpdict['action'])
                        self.current_form_url=tmpdict['action']

                  # Forms use GET method by default
                  self.current_form_method="get"
                  if "method" in tmpdict.keys():
                        if tmpdict["method"].lower()=="post":
                              self.current_form_method="post"

            if tag.lower()=='input':
                  if self.inform==1:
                        if "type" not in tmpdict.keys():
                              tmpdict["type"]="text"
                        if "name" in tmpdict.keys():
                              if tmpdict['type'].lower() in ['text','password','radio','checkbox','hidden','submit','search']:
                                    # use default value if present or set it to 'on'
                                    if "value" in tmpdict.keys():
                                          if tmpdict["value"]!="": val=tmpdict["value"]
                                          else: val="on"
                                    else: val="on"
                                    self.form_values.update(dict([(tmpdict['name'],val)]))
                              if tmpdict['type'].lower()=="file":
                                    self.uploads.append(self.current_form_url)

            if tag.lower() in ["textarea","select"]:
                  if self.inform==1:
                        if "name" in tmpdict.keys():
                              self.form_values.update(dict([(tmpdict['name'],'on')]))

            if tag.lower() in ["frame","iframe"]:
                  if "src" in tmpdict.keys():
                        self.liens.append(tmpdict['src'])

      def handle_endtag(self,tag):
            if tag.lower()=='form':
                  self.inform=0
                  if self.current_form_method=="post":
                        self.forms.append((self.current_form_url,self.form_values))
                  else:
                        l=["=".join([k,v]) for k,v in self.form_values.items()]
                        l.sort()
                        self.liens.append(self.current_form_url.split("?")[0]+"?"+"&".join(l))
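
# A minimal sketch of using linkParser on its own (the HTML snippet is made up):
#     p=linkParser()
#     p.feed('<a href="/page.php?id=1">x</a><frame src="menu.html">')
#     print p.liens   # -> ['/page.php?id=1', 'menu.html']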


if __name__ == "__main__":
      try:
            prox={}
            auth=[]
            if len(sys.argv)<2:
                  print lswww.__doc__
                  sys.exit(0)
            if '-h' in sys.argv or '--help' in sys.argv:
                  print lswww.__doc__
                  sys.exit(0)
            myls=lswww(sys.argv[1])
            myls.verbosity(1)
            try:
                  opts, args = getopt.getopt(sys.argv[2:], "hp:s:x:c:a:r:v:t:",
                        ["help","proxy=","start=","exclude=","cookie=","auth=","remove=","verbose=","timeout="])
            except getopt.GetoptError,e:
                  print e
                  sys.exit(2)
            for o,a in opts:
                  if o in ("-h", "--help"):
                        print lswww.__doc__
                        sys.exit(0)
                  if o in ("-s","--start"):
                        if (a.find("http://",0)==0) or (a.find("https://",0)==0):
                              myls.addStartURL(a)
                  if o in ("-x","--exclude"):
                        if (a.find("http://",0)==0) or (a.find("https://",0)==0):
                              myls.addExcludedURL(a)
                  if o in ("-p","--proxy"):
                        if (a.find("http://",0)==0) or (a.find("https://",0)==0):
                              prox={'http':a}
                              myls.setProxy(prox)
                  if o in ("-c","--cookie"):
                        myls.setCookieFile(a)
                  if o in ("-r","--remove"):
                        myls.addBadParam(a)
                  if o in ("-a","--auth"):
                        if a.find("%")>=0:
                              auth=[a.split("%")[0],a.split("%")[1]]
                              myls.setAuthCredentials(auth)
                  if o in ("-v","--verbose"):
                        if str.isdigit(a):
                              myls.verbosity(int(a))
                  if o in ("-t","--timeout"):
                        if str.isdigit(a):
                              myls.setTimeOut(int(a))
            myls.go()
            myls.printLinks()
            myls.printForms()
            myls.printUploads()
      except SystemExit:
            pass
