"""Extract the main readable text from an HTML page fetched over HTTP.

Heuristic: strip markup, score each candidate text line by the ratio of
its text length to the accumulated length of enclosing tags, and print
the lines whose ratio is high relative to the page maximum.

NOTE(review): many regex patterns in this file are empty (r'').  They
appear to be HTML tag names that were stripped when the file was
converted/pasted.  An empty pattern matches EVERY string, so the first
branch of each chain always fires and the later branches are dead code.
The original tag lists need to be recovered -- TODO.
"""
import re
import sys
import urllib.error
import urllib.request

# Proxy credentials were hard-coded inside get_url_text() in the
# original; hoisted here so they are visible and overridable.
# NOTE(review): credentials embedded in a URL -- move to configuration
# or environment variables before real use.
_PROXIES = {
    'http': 'http://068.472305051748:683954sanjay@10.1.1.18:80',
    'https': 'https://068.472305051748:683954sanjay@10.1.1.18:80',
}


def get_text(f1):
    """Read HTML from the file-like object *f1*, extract the readable
    text, print it, and return it.

    Returns the extracted text (the original only printed it; the
    return value is new and backward-compatible).
    """
    raw = f1.read()
    # urllib.request.urlopen() yields bytes on Python 3; the original
    # Python 2 code received str and split on '\n' directly.
    if isinstance(raw, bytes):
        raw = raw.decode('utf-8', 'replace')

    # Collapse the document onto a single line, one space per newline
    # (equivalent to the original per-line concatenation loop).
    html = ' '.join(raw.split('\n')) + ' '

    capture = True  # NOTE(review): written below but never read -- vestigial.
    filtered_text = ''
    for chunk in html.split('<'):
        h = '<' + chunk
        for w in h.split('>'):
            if not w:
                continue
            if w[0] == '<':
                w += '>'
            # NOTE(review): r'' matches any string, so this branch always
            # fires and every elif below is unreachable.  The empty
            # patterns were presumably tag names lost in transit.
            if re.search(r'', w):
                capture = True
            elif (re.search(r'', h) or re.search(r'', h)
                  or re.search(r'', h) or re.search(r'', h)):
                capture = True
            elif (re.search(r'', h) or re.search(r'', h)
                  or re.search(r'', h) or re.search(r'', h)
                  or re.search(r'', h)):
                capture = True
            elif re.search(r'', h):
                capture = True
            elif (re.search(r'', h, re.IGNORECASE)
                  or re.search(r'', h, re.IGNORECASE)
                  or re.search(r'', h, re.IGNORECASE)
                  or re.search(r'', h, re.IGNORECASE)
                  or re.search(r'', h, re.IGNORECASE)
                  or re.search(r'', h, re.IGNORECASE)
                  or re.search(r'', h, re.IGNORECASE)
                  or re.search('\n', h, re.IGNORECASE)):
                pass  # tags to be dropped entirely
            elif (re.search(r'', h, re.IGNORECASE)
                  or re.search(r'', h, re.IGNORECASE)
                  # FIX: the original call was re.search(r'') with no
                  # string argument -- a TypeError had it ever been
                  # reached; the subject string is supplied now.
                  or re.search(r'', h)):
                filtered_text += '\n' + h + '\n'
            elif h.startswith(''):  # vacuously true for any string
                filtered_text += '\n' + h + '\n'
            else:
                filtered_text += ' ' + h

    # Second pass: score each surviving line by text/tag-length ratio.
    tag_lengths = []   # stack of open-tag lengths contributing to `total`
    text_lines = []    # candidate output lines, in order
    ratios = [0]       # seeded so max(ratios) below never sees an empty list
    total = 1          # seeded so the ratio division cannot divide by zero
    for h in filtered_text.split('\n'):
        # NOTE(review): r'' matches anything, so this pops whenever the
        # stack is non-empty -- another casualty of the lost patterns.
        if re.search(r'', h) and tag_lengths:
            total -= tag_lengths.pop()
        elif re.search(r'<(.*)>', h):
            total += len(h)
            tag_lengths.append(len(h))
        elif h and not h.isspace():
            # Drop HTML entities (&amp;) and pipe-terminated tokens;
            # precedence matches the original: (starts & ends) or ends-|.
            kept = [w for w in h.split(' ')
                    if not ((w.startswith('&') and w.endswith(';'))
                            or w.endswith('|'))]
            cleaned = ''.join(w + ' ' for w in kept)
            ratios.append(len(cleaned) / total)
            text_lines.append(cleaned)

    # Emit lines scoring above 45% of the best ratio, plus sentence-like
    # lines (ending '.') above 30%.
    max_ratio = max(ratios)
    text = ''
    i = -1
    for r in ratios:
        if r > 0:
            i += 1
        if r > max_ratio * 0.45:
            text += text_lines[i] + '\n'
        elif r > max_ratio * 0.3 and text_lines[i].endswith('.'):
            text += text_lines[i] + '\n'
    print(text)
    return text


def get_url_text(url, proxies=None):
    """Fetch *url* through an HTTP proxy and run get_text() on the body.

    proxies -- optional scheme->proxy-URL mapping; defaults to the
    module-level _PROXIES (preserving the original hard-coded behaviour).
    Returns '\\n' on any HTTP/URL error instead of raising.
    """
    proxy = urllib.request.ProxyHandler(
        _PROXIES if proxies is None else proxies)
    auth = urllib.request.HTTPBasicAuthHandler()
    opener = urllib.request.build_opener(
        proxy, auth, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)
    try:
        f = urllib.request.urlopen(url)
    except urllib.error.URLError:  # HTTPError is a subclass of URLError
        return '\n'
    try:
        return get_text(f)
    finally:
        f.close()  # FIX: the original leaked the response object


def main():
    """CLI entry point: extract text from the URL given as argv[1]."""
    get_url_text(sys.argv[1])


if __name__ == "__main__":
    main()