Help with cookies/authentication

  • trihaitran
    New Member
    • Feb 2008
    • 7

    Help with cookies/authentication

    Hi, I am trying to pull some data from a Web site: http://schoolfinder.com

    The issue is that I want to use the advanced search feature, which requires logging into the Web site. I have a username and password, but I want to connect programmatically from Python. I have done data capture from the Web before, so the only new thing to me here is the authentication. I need cookies, as this page describes: http://schoolfinder.com/login/login.asp

    I already know how to add POST/GET data to a request, but how do I deal with cookies/authentication? I have read a few articles without success:

    • urllib2
    • the urllib2 Cookbook
    • basic authentication
    • cookielib
    Is there some other resource I am missing? Could someone set up a basic script that would let me connect to schoolfinder.com with my username and password? My username is "greenman", password is "greenman". All I need to know is how to access pages as if I had logged in with a Web browser.

    Thank you very much.
  • Formula
    New Member
    • Aug 2008
    • 11

    #2
    Try this code. It will log in to schoolfinder.com and register all the cookies it receives in a file.

    Code:
    #!/usr/local/bin/python

    COOKIEFILE = 'cookies.lwp'      # the path and filename that you want to use to save your cookies in

    import os.path
    import sys

    cj = None
    ClientCookie = None
    cookielib = None

    try:                            # let's see if cookielib is available
        import cookielib
    except ImportError:
        pass
    else:
        import urllib2
        urlopen = urllib2.urlopen
        cj = cookielib.LWPCookieJar()   # a subclass of FileCookieJar that has useful load and save methods
        Request = urllib2.Request

    if not cookielib:               # if importing cookielib fails, let's try ClientCookie
        try:
            import ClientCookie
        except ImportError:
            import urllib2
            urlopen = urllib2.urlopen
            Request = urllib2.Request
        else:
            urlopen = ClientCookie.urlopen
            cj = ClientCookie.LWPCookieJar()
            Request = ClientCookie.Request

    ####################################################
    # We've now imported the relevant library - whichever library is being
    # used, urlopen is bound to the right function for retrieving URLs and
    # Request is bound to the right class for creating Request objects.
    # Let's load the cookies, if they exist.

    if cj is not None:
        # Install our CookieJar so that it is used as the default
        # CookieProcessor in the default opener handler.
        if os.path.isfile(COOKIEFILE):
            cj.load(COOKIEFILE)
        if cookielib:
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
            urllib2.install_opener(opener)
        else:
            opener = ClientCookie.build_opener(ClientCookie.HTTPCookieProcessor(cj))
            ClientCookie.install_opener(opener)

    # If one of the cookie libraries is available, any call to urlopen will
    # now handle cookies using the CookieJar instance we've created.
    # (Note that if we are using ClientCookie we haven't explicitly imported urllib2.)
    # As an example:

    theurl = 'http://schoolfinder.com/login/login.asp'  # an example url that sets a cookie - try different urls here and see the cookie collection you can make!
    body = {'usr': 'greenman', 'pwd': 'greenman'}

    from urllib import urlencode

    txdata = urlencode(body)    # for a POST request, encode the dictionary of form values with urllib.urlencode
    txheaders = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}    # fake a user agent; some websites (like google) don't like automated exploration

    try:
        req = Request(theurl, txdata, txheaders)    # create a request object
        handle = urlopen(req)                       # and open it to return a handle on the url
    except IOError, e:
        print 'We failed to open "%s".' % theurl
        if hasattr(e, 'code'):
            print 'We failed with error code - %s.' % e.code
        elif hasattr(e, 'reason'):
            print "The error object has the following 'reason' attribute :", e.reason
            print "This usually means the server doesn't exist, is down, or we don't have an internet connection."
            sys.exit()
    else:
        print 'Here are the headers of the page :'
        print handle.info()     # handle.read() returns the page; handle.geturl() returns the true url of the page fetched (in case urlopen followed any redirects)

    print
    if cj is None:
        print "We don't have a cookie library available - sorry."
        print "I can't show you any cookies."
    else:
        print 'These are the cookies we have received so far :'
        for index, cookie in enumerate(cj):
            print index, '  :  ', cookie
        cj.save(COOKIEFILE)     # save the cookies again
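
    Once the script above has run, the jar is saved in cookies.lwp, so a later run can restore the session without posting the login form again. A minimal sketch of that second run (assuming cookielib is available; the advanced-search URL here is made up for illustration, not taken from the site):

    Code:
    #!/usr/local/bin/python
    # Sketch: reload the cookie file saved by the script above and fetch a
    # protected page without logging in again.
    import os.path
    import cookielib
    import urllib2

    COOKIEFILE = 'cookies.lwp'
    cj = cookielib.LWPCookieJar()
    if os.path.isfile(COOKIEFILE):
        cj.load(COOKIEFILE)     # restore the cookies from the earlier session

    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)

    # 'advsearch.asp' is a hypothetical protected page, used only as an example
    handle = urllib2.urlopen('http://schoolfinder.com/search/advsearch.asp')
    print handle.info()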


    • trihaitran
      New Member
      • Feb 2008
      • 7

      #3
      Thanks for the help. Your code by itself did not work, but it pushed me in the right direction. Here is what worked for me and let me see the protected pages:

      Code:
      #!/usr/bin/env python
      # -*- coding: UTF-8 -*-

      import cookielib
      import sys
      import urllib
      import urllib2

      cj = cookielib.CookieJar()
      opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
      resp = opener.open('http://schoolfinder.com')  # visit the site first so it can set its session cookie

      theurl = 'http://schoolfinder.com/login/login.asp'  # the url that the login form posts to
      body = {'usr': 'greenman', 'pwd': 'greenman'}
      txdata = urllib.urlencode(body)  # encode the form fields for a POST request
      txheaders = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}  # fake a user agent; some websites don't like automated exploration

      try:
          req = urllib2.Request(theurl, txdata, txheaders)  # create a request object
          handle = opener.open(req)                         # and open it to return a handle on the url
          HTMLSource = handle.read()
          f = open('test.html', 'w')
          f.write(HTMLSource)
          f.close()

      except IOError, e:
          print 'We failed to open "%s".' % theurl
          if hasattr(e, 'code'):
              print 'We failed with error code - %s.' % e.code
          elif hasattr(e, 'reason'):
              print "The error object has the following 'reason' attribute :", e.reason
              print "This usually means the server doesn't exist, is down, or we don't have an internet connection."
              sys.exit()

      else:
          print 'Here are the headers of the page :'
          print handle.info()  # handle.read() returns the page; handle.geturl() returns the true url (in case urlopen followed redirects)
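
      For what it's worth, the initial opener.open('http://schoolfinder.com') presumably matters because the site hands out a session cookie before you log in, and that cookie has to accompany the login POST. You can watch the jar fill up after each request with a small self-contained sketch like this:

      Code:
      #!/usr/bin/env python
      # Sketch: print the cookie jar after the pre-login GET to confirm
      # that the site really does set a session cookie before login.
      import cookielib
      import urllib2

      cj = cookielib.CookieJar()
      opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
      opener.open('http://schoolfinder.com')  # pre-login GET

      print 'Cookies after the first GET:'
      for index, cookie in enumerate(cj):
          print index, ':', cookie.name, '=', cookie.value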


      • johnpollard
        New Member
        • Oct 2008
        • 2

        #4
        Your script works for me, but the one below for another site does not. The test.html file is not my logged-in page, as it is when I run your script.

        The only lines of code I changed are:
        resp = opener.open('http://www.amm.com/')
        theurl = 'http://www.amm.com/login.asp'
        body={'username':'AMMT54590570','password':'AMMT32564288'}

        What am I doing wrong?

        -----------------------------------
        Code:
        #!/usr/bin/env python
        # -*- coding: UTF-8 -*-

        import cookielib
        import sys
        import urllib
        import urllib2

        cj = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        resp = opener.open('http://www.amm.com/login.asp')  # visit the site first so it can set its session cookie

        theurl = 'http://www.amm.com/login.asp'  # the url that the login form posts to
        body = {'username': 'AMMT54590570', 'password': 'AMMT32564288'}
        txdata = urllib.urlencode(body)  # encode the form fields for a POST request
        txheaders = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}  # fake a user agent

        try:
            req = urllib2.Request(theurl, txdata, txheaders)  # create a request object
            handle = opener.open(req)                         # and open it to return a handle on the url
            HTMLSource = handle.read()
            f = open('test.html', 'w')
            f.write(HTMLSource)
            f.close()

        except IOError, e:
            print 'We failed to open "%s".' % theurl
            if hasattr(e, 'code'):
                print 'We failed with error code - %s.' % e.code
            elif hasattr(e, 'reason'):
                print "The error object has the following 'reason' attribute :", e.reason
                print "This usually means the server doesn't exist, is down, or we don't have an internet connection."
                sys.exit()

        else:
            print 'Here are the headers of the page :'
            print handle.info()  # handle.read() returns the page; handle.geturl() returns the true url (in case urlopen followed redirects)
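
        One guess, since nothing here confirms it: the amm.com login form may not use username/password as its field names, may post to a different action URL than login.asp, or may include hidden fields (such as a session token) that have to be echoed back in the POST. A quick way to check is to dump the form and input tags from the login page; a sketch using only the standard library:

        Code:
        #!/usr/bin/env python
        # Sketch: print the <form> and <input> tags from the login page so
        # the action URL and real field names can be compared against the
        # POST body the script is sending.
        import re
        import urllib2

        html = urllib2.urlopen('http://www.amm.com/login.asp').read()
        for tag in re.findall(r'<(?:form|input)[^>]*>', html, re.IGNORECASE):
            print tag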
