Simple Tag Getter with lxml


#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import urllib2
import lxml.html
def get_texts(url, tag):
    """Download *url* and return the text of every element matching *tag*.

    Args:
        url: Address of the HTML page to fetch.
        tag: XPath expression selecting the elements of interest.

    Returns:
        A list with one UTF-8 encoded text string per matching element, in
        the order the XPath query yields them.  The list is empty when the
        page cannot be fetched or when nothing matches; a short notice is
        printed in either case.
    """
    try:
        response = urllib2.urlopen(url)
    except (urllib2.URLError, ValueError):
        # URLError is the base class of HTTPError, so this covers HTTP error
        # statuses (e.g. 404) as well as unreachable hosts.  urlopen raises
        # ValueError for strings it cannot interpret as a URL at all.
        print("Can't access to the given URL.")
        return []

    try:
        html = response.read()
    finally:
        # Release the connection even if reading fails part-way.
        response.close()

    root = lxml.html.fromstring(html)
    text_list = [
        lxml.html.tostring(node, method='text', encoding='utf-8')
        for node in root.xpath(tag)
    ]
    if not text_list:
        print("There are no tags in this page")
    return text_list

def create_xpath(tag, attr):
    """Build an XPath expression matching *tag* elements anywhere in a page.

    Args:
        tag: Element name to search for, e.g. ``"div"``.
        attr: Optional attribute filter.  ``"name=value"`` restricts the
            match to elements whose attribute equals the value, a bare
            ``"name"`` matches elements that carry the attribute at all,
            and an empty string applies no filter.

    Returns:
        The XPath string, e.g. ``//div[@class="dochead"]``.  The expression
        is also printed.
    """
    xpath = "//" + tag
    if attr:
        # Split on the first "=" only, so values that themselves contain "="
        # (query strings, for instance) stay intact.
        name, has_value, value = attr.partition("=")
        if has_value:
            # XPath 1.0 string literals have no escape syntax; switch to
            # single quotes when the value contains a double quote.
            quote = "'" if '"' in value else '"'
            xpath += "[@" + name + "=" + quote + value + quote + "]"
        else:
            # No value given: test only for the attribute's presence.
            xpath += "[@" + name + "]"
    print("Created Xpath: " + xpath)
    return xpath

# Command-line entry point: tag-getter.py URL TAG [ATTR]
if __name__ == "__main__":
    argv_len = len(sys.argv)
    if argv_len not in (3, 4):
        print("Usage: python tag-getter.py URL TAG [ATTR]")
        # Stop here; continuing would index missing arguments.
        sys.exit(1)

    # The URL is passed through as typed: paths and query strings are
    # case-sensitive, so lower-casing could request a different resource.
    url = sys.argv[1]
    tag = sys.argv[2].lower()
    if argv_len == 4:
        # Lower-case only the attribute name; the value is compared
        # case-sensitively by XPath and must be kept verbatim.
        name, sep, value = sys.argv[3].partition("=")
        attr = name.lower() + sep + value
    else:
        attr = ""

    xpath = create_xpath(tag, attr)
    text_list = get_texts(url, xpath)
    for t in text_list:
        print(t)

Usage Example:

  $ python tag-getter.py http://ebooks.adelaide.edu.au/c/carroll/lewis/alice/chapter1.html div class=dochead

Created Xpath: //div[@class="dochead"]
Alice in Wonderland, by Lewis Carroll

  $ python tag-getter.py http://ebooks.adelaide.edu.au/c/carroll/lewis/alice/chapter1.html p

Created Xpath: //p
Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice
she had peeped into the book her sister was reading, but it had no pictures or conversations in it, ‘and what is the use of
a book,’ thought Alice ‘without pictures or conversation?’

... (rest of the output omitted) ...

