RE: xpath questions...

Collapse
This topic is closed.
X
X
 
  • Time
  • Show
Clear All
new posts
  • bruce

    RE: xpath questions...

    valid point...!!

    here's the test python.. ugly as it is!!

    #!/usr/bin/python
    #
    # test.py
    #
    # scrapes/extracts the basic data for the college
    #
    #
    # the app gets/stores
    # name
    # url
    # address (street/city/state)
    # phone
    #
    ############### ############### ############### ############### ##########3
    #test python script
    import re
    import libxml2dom
    import urllib
    import urllib2
    import sys, string
    from mechanize import Browser
    import mechanize
    #import tidy
    import os.path
    import cookielib
    from libxml2dom import Node
    from libxml2dom import NodeList
    import subprocess
    import MySQLdb
    #import mysql_config
    import time


    ############### #########
    #
    # Parse pricegrabber.com
    ############### #########


    # Module-level handles kept from the original script. Only ``br`` and
    # ``url`` are used by the main section below; the rest (urlopen, Request,
    # br2, user_agent, values1, headers) are defined but never referenced here.
    urlopen = urllib2.urlopen
    ##cj = urllib2.cookielib.LWPCookieJar()
    Request = urllib2.Request
    br = Browser()
    br2 = Browser()


    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    values1 = {'name' : 'Michael Foord',
               'location' : 'Northampton',
               'language' : 'Python' }
    headers = { 'User-Agent' : user_agent }

    # Page to fetch. The script previously also assigned
    # "http://schedule.berkeley.edu/" here, but that value was immediately
    # overwritten, so only the effective target is kept.
    url = "http://schedule.psu.edu/"
    #=========================================


    if __name__ == "__main__":
        # main app: fetch ``url``, parse it as HTML and print the ``action``
        # attribute of the form named 'cos_search1' in three different ways
        # (nodeValue, textContent, toString()).

        txdata = None

        #----------------------------
        # configure the browser and fetch the test page

        #br.set_cookiejar(cj)
        br.set_handle_redirect(True)
        br.set_handle_referer(True)
        br.set_handle_robots(False)
        br.addheaders = [('User-Agent', 'Firefox')]

        murl = url

        # Single-argument print(...) emits the same text as the original
        # print statements on Python 2 and also parses on Python 3.
        print("url = %s" % murl)
        br.open(murl)
        #cj.save(COOKIEFILE) # resave cookies

        res = br.response() # this is a copy of response
        s = res.read()

        # s contains HTML not XML text
        d = libxml2dom.parseString(s, html=1)

        # XPath under test: the action attribute of the search form.
        # (An earlier, overwritten query was "//img/parent::*/attribute::href".)
        #tn1 = "//div[@id='main_content']/form[1]/input[position()=1]/@name"
        q = "//form[@name='cos_search1']/@action"

        t1 = d.xpath(q)
        print("href =  %s" % (t1,))

        # Guard against an empty result: indexing t1[0] on no match would
        # raise IndexError and hide the real problem (the query matched nothing).
        if not t1:
            sys.exit("no nodes matched xpath: %s" % q)

        node = t1[0]
        print("hnode = %s" % node.nodeValue)
        print("htest = %s" % node.textContent)
        print("htesttt = %s" % node.toString())

        sys.exit()

    thanks!!


    -----Original Message-----
    From: python-list-bounces+bedouglas=earthlink.net@python.org
    [mailto:python-list-bounces+bedouglas=earthlink.net@python.org]On Behalf
    Of Fredrik Lundh
    Sent: Saturday, August 23, 2008 5:58 AM
    To: python-list@python.org
    Subject: Re: xpath questions...


    bruce wrote:
    Regarding the xpath question I've posed, some have said that it shouldn't
    be
    here on the mailing list. Given that I'm writing the test scripts/apps in
    python, using the python libs, where else should it be posted?
    >
    I mean, I could post the entire sample script so you can see that it's
    using
    python, but I simplified the issue.
    there was zero Python content left after the simplification. maybe you
    should at least mention what library you're using to "play around with
    xpath and the html dom" ?

    </F>

    --


Working...