import urllib
import string
import htmllib
import formatter
# Search terms, already joined with '+' for use in the query URL.
query_string = 'Cottonwood+arizona+geology+kids+winter+snowshoe'
page_URL = 'http://google.yahoo.com/bin/query?p=' + query_string
# Alternative query kept for reference:
# page_URL = 'http://google.yahoo.com/bin/query?p=cottonwood+az&hc=1&hs=9'

# Fetch the first results page.  Nothing below can work without it, so a
# failed fetch reports the URL and stops the script.  (The original used a
# bare `end` here, which is not Python and only stopped the script by
# raising NameError.)
try:
    search_page = urllib.urlopen(page_URL)
except IOError:
    print('Error: ' + page_URL)
    raise SystemExit(1)
page_list = search_page.readlines()
search_page.close()

# Module-level state shared with the code further down the file.
flag = 0
count = 0
link_number_line = ''
link_number_string = ''

# Output files.  'w+' creates the files when they are missing and truncates
# them, so links left over from a previous (longer) run are not read back
# later; the '+' keeps them readable after writing, as before.
links = open('c:\\links.txt', 'w+')
links2 = open('c:\\links2.txt', 'w+')

total_links = 0
start_point = 0
page_text = ''
def write_links(page_list, out=None):
    """Write the target URLs of redirect links found in a results page.

    Lines are ignored until one containing 'Web Page Matches' is seen.  From
    that line on, every ``a href="..."`` value that contains
    'srd.yahoo.com' is treated as a redirect link: the part after the first
    '*' is written to ``out``, one URL per line.  When the value has no '*'
    the whole value is written.

    page_list -- iterable of text lines of the results page.
    out       -- object with a ``write`` method; defaults to the module-level
                 ``links`` file when omitted.
    """
    if out is None:
        out = links

    href_marker = 'a href="'
    in_results = False
    for ln in page_list:
        if 'Web Page Matches' in ln:
            in_results = True
        if not in_results:
            continue

        # Walk every anchor on the line, not just the first one.
        pos = ln.find(href_marker)
        while pos != -1:
            value_start = pos + len(href_marker)
            # Stop at the quote that closes this href; taking the last quote
            # on the line would drag trailing attributes into the URL.
            value_end = ln.find('"', value_start)
            if value_end == -1:
                break  # unterminated attribute: nothing usable on this line
            reference = ln[value_start:value_end]
            if 'srd.yahoo.com' in reference:
                # find() returns -1 when there is no '*', so the slice then
                # starts at 0 and keeps the whole reference.
                out.write(reference[reference.find('*') + 1:] + '\n')
            pos = ln.find(href_marker, value_end + 1)
class tomParser(htmllib.HTMLParser):
    """HTML parser that echoes anchor, body and title text to stdout.

    The start_*/end_* handlers only toggle flags recording which element the
    parser is currently inside; handle_data() consults those flags to decide
    what to print.
    """

    def __init__(self, formatter):
        htmllib.HTMLParser.__init__(self, formatter)
        # Flag to determine if we are in an anchor tag.
        self.in_anchor = 0
        # Flags for being inside <head>, <body> and <title>.
        self.headflag = 0
        self.bodyflag = 0
        self.titleflag = 0
        self.link_list = []

    def start_head(self, attrs):
        """Record that the parser has entered <head>."""
        self.headflag = 1

    def end_head(self):
        """Record that the parser has left <head>."""
        self.headflag = 0

    def start_body(self, attrs):
        """Record that the parser has entered <body> and keep its attrs."""
        self.bodyflag = 1
        # NOTE(review): the original statement here was the incomplete
        # `self.link_list attrs` (a syntax error).  Appending the attribute
        # list is the presumed intent -- confirm.
        self.link_list.append(attrs)

    def end_body(self):
        """Record that the parser has left <body>."""
        self.bodyflag = 0

    def start_title(self, attrs):
        """Record that the parser has entered <title>."""
        self.titleflag = 1

    def end_title(self):
        """Record that the parser has left <title>."""
        self.titleflag = 0

    def start_a(self, attrs):
        """Signal when we get to an <a> tag and print its attributes."""
        self.in_anchor = 1
        print('Anchor:  %s' % (attrs,))

    def end_a(self):
        """Signal when we are out of the anchor -- a tag."""
        self.in_anchor = 0

    def handle_data(self, text):
        """This is called every time we get to text data (i.e. not tags)."""
        stripped = text.strip()
        if self.in_anchor:
            print(text)
        if self.bodyflag:
            # Skip text that is blank or a lone non-breaking space.  The
            # original test `x <> '/n' or '' or chr(160)` was always true
            # because chr(160) is truthy, so nothing was ever filtered.
            if stripped not in ('', chr(160)):
                print(stripped)
            # Diagnostic: show the character code of single-character text.
            if len(stripped) == 1:
                print(ord(stripped))
        if self.titleflag:
            print('Title:  %s' % text)
page_format = formatter.NullFormatter()
page_parser = tomParser(page_format)

# Find the line holding the hit count: it is the line that directly follows
# a 'Web Page Matches' heading line.
previous_was_heading = False
for ln in page_list:
    is_heading = 'Web Page Matches' in ln
    if previous_was_heading and not is_heading:
        link_number_line = ln
    previous_was_heading = is_heading
print(link_number_line)

# Every digit on that line, concatenated, is taken as the total hit count.
link_number_string = link_number_string + ''.join(
    [ch for ch in link_number_line if ch in string.digits])
if link_number_string:
    total_links = int(link_number_string)
else:
    # No heading or no digits: treat as zero hits instead of failing in int('').
    total_links = 0

# Walk the result pages in steps of 20 and collect their links.
for c in range(1, total_links, 20):
    page_URL = ('http://google.yahoo.com/bin/query?p=' + query_string +
                '&b=' + str(c))
    try:
        search_page = urllib.urlopen(page_URL)
    except IOError:
        print('Error: ' + page_URL)
        # Skip this page; falling through would reuse the previous, already
        # closed response object.
        continue
    try:
        page_list = search_page.readlines()
    finally:
        search_page.close()
    write_links(page_list)

# First search term; split() also copes with a query that has no '+'.
first_term = query_string.split('+')[0]

# Re-read the collected links and visit each one.
links.seek(0)
while True:
    page_URL = links.readline()
    if page_URL == '':
        break
    start_point = 0
    print(page_URL)
    try:
        search_page = urllib.urlopen(page_URL)
    except IOError:
        print('Error: ' + page_URL)
        continue
    try:
        page_text = search_page.read()
    finally:
        search_page.close()
    page_parser.feed(page_text)

    # For each occurrence of the first search term, save the text between
    # the nearest '>' before it and the nearest '<' after it.
    if first_term:
        hit = page_text.find(first_term, start_point)
        while hit != -1:
            # rfind() gives -1 when there is no '>' before the hit, which
            # makes the snippet start at the beginning of the page.
            snippet_start = page_text.rfind('>', 0, hit) + 1
            snippet_end = page_text.find('<', hit)
            if snippet_end == -1:
                snippet_end = len(page_text)
            info_line = page_text[snippet_start:snippet_end]
            links2.write(info_line)
            start_point = hit + len(first_term)
            hit = page_text.find(first_term, start_point)

links.close()
links2.close()