
def lswww::lswww::browse(self, url)

Extract URLs from a web page and add them to the list of URLs to browse if they aren't in the exclusion list.

Definition at line 160 of file lswww.py.

      def browse(self, url):
            """Extract urls from a webpage and add them to the list of urls to browse if they aren't in the exclusion list"""
            # We don't need destination anchors
            current=url.split("#")[0]
            # Url without query string
            current=current.split("?")[0]
            # Get the dirname of the file
            currentdir="/".join(current.split("/")[:-1])+"/"

            # The timeout must not be too long, or big documents (for example a download script) would block us,
            # and not too short, so we still get good results
            socket.setdefaulttimeout(self.timeout)
            try:
                  req = urllib2.Request(url)
                  u = urllib2.urlopen(req)
            # BadStatusLine can happen when no HTTP status code is given or when a connection is suddenly closed
            except urllib2.httplib.BadStatusLine:
                  print "Error reading response"
                  return 0
            except IOError,e:
                  print "\n"+url+":",e
                  self.excluded.append(url)
                  return 0
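            # The content-type filtering below only makes sense for http(s) resources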
            proto=url.split("://")[0]
            if proto=="http" or proto=="https":
                  # Check the content-type first
                  if not u.info().get("Content-Type"):
                        # Sometimes there's no content-type... so we rely on the document extension
                        if (current.split(".")[-1] not in self.allowed) and current[-1]!="/":
                              return 1
                  elif u.info().get("Content-Type").find("text")==-1:
                        return 1
            # Manage redirections
            if u.headers.dict.has_key("location"):
                  redir=self.correctlink(u.headers.dict["location"],current,currentdir,proto)
                  if redir!=None:
                        if(self.inzone(redir)==0):
                              # Is the document already visited or forbidden?
                              if (redir in self.browsed) or (redir in self.tobrowse) or self.isExcluded(redir):
                                    pass
                              else:
                                    # No -> Will browse it soon
                                    self.tobrowse.append(redir)
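            # Fetch the document body; fall back to an empty page if the read times out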
            try:
                  htmlSource=u.read()
            except socket.timeout:
                  htmlSource=""
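            # Parse the page for links, upload fields and forms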
            p=linkParser()
            try:
                  p.feed(htmlSource)
            except HTMLParser.HTMLParseError,err:
                  if tidyhere==1:
                        options = dict(output_xhtml=1, add_xml_decl=1, indent=1, tidy_mark=0)
                        htmlSource=str(tidy.parseString(htmlSource,**options))
                        try:
                              p.reset()
                              p.feed(htmlSource)
                        except HTMLParser.HTMLParseError,err:
                              pass
                  elif BeautifulSouphere==1:
                        htmlSource=BeautifulSoup.BeautifulSoup(htmlSource).prettify()
                        try:
                              p.reset()
                              p.feed(htmlSource)
                        except HTMLParser.HTMLParseError,err:
                              pass
                  # last chance
                  else:
                        p.liens=re.findall('href="(.*?)"',htmlSource)

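            # Resolve the harvested upload fields and links relative to the current page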
            for lien in p.uploads:
                  self.uploads.append(self.correctlink(lien,current,currentdir,proto))
            for lien in p.liens:
                  lien=self.correctlink(lien,current,currentdir,proto)
                  if lien!=None:
                        if(self.inzone(lien)==0):
                              # Is the document already visited or forbidden?
                              if (lien in self.browsed) or (lien in self.tobrowse) or self.isExcluded(lien):
                                    pass
                              else:
                                    # No -> Will browse it soon
                                    self.tobrowse.append(lien)
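            # Rebuild each form with its resolved action and the url it was found on, skipping duplicates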
            for form in p.forms:
                  action=self.correctlink(form[0],current,currentdir,proto)
                  if action==None: action=current
                  form=(action,form[1],url)
                  if form not in self.forms: self.forms.append(form)
            # We automatically exclude 404 urls
            if u.code==404:
                  self.excluded.append(url)
                  return 0
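            # Success: the page was fetched and its links were processed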
            return 1
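
For orientation, here is a minimal, hypothetical sketch of how browse() is typically driven from a crawl loop. The attribute names tobrowse and browsed come from the listing above; the constructor call and the loop itself are assumptions, not the actual wapiti driver.

      # Hypothetical driver loop -- the constructor arguments are an assumption,
      # but tobrowse/browsed are the attributes used by browse() above.
      scanner = lswww("http://target.example/")
      scanner.tobrowse.append("http://target.example/")
      while len(scanner.tobrowse) > 0:
            url = scanner.tobrowse.pop(0)
            scanner.browse(url)          # returns 0 on network errors or 404
            scanner.browsed.append(url)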
      
