"""Scrape Yahoo search results and harvest text around the first query term.

This is a Python 2 script: it relies on ``htmllib`` and ``urllib.urlopen``.

Steps, all executed at module level:

1. Fetch the first result page for ``query_string``.
2. Take the line that follows the 'Web Page Matches' header and read the
   number of matches from its digits.
3. Request the result pages, stepping the ``&b=`` offset by 20, and store
   every ``srd.yahoo.com`` redirect target in ``links.txt``.
4. Download every stored link, feed it to ``tomParser`` (which prints what
   it sees) and write the text surrounding each occurrence of the first
   query term to ``links2.txt``.
"""

import urllib
import string
import htmllib
import formatter

SEARCH_URL = 'http://google.yahoo.com/bin/query?p='
RESULTS_HEADER = 'Web Page Matches'
HREF_MARK = 'a href="'

query_string = 'Cottonwood+arizona+geology+kids+winter+snowshoe'
page_URL = SEARCH_URL + query_string
# page_URL = 'http://google.yahoo.com/bin/query?p=cottonwood+az&hc=1&hs=9'

try:
    search_page = urllib.urlopen(page_URL)
except IOError:
    print('Error: ' + page_URL)
    # Nothing below can work without the first page.  The original had a
    # bare ``end`` here, which only aborted by accident (NameError).
    raise SystemExit(1)
page_list = search_page.readlines()
search_page.close()

link_number_line = ''
link_number_string = ''
total_links = 0
page_text = ''


def write_links(page_list, out=None):
    """Write the redirect targets found on one result page, one per line.

    Lines are ignored until one containing 'Web Page Matches' has been seen
    (that line itself is examined too).  From then on, the first
    ``a href="..."`` value of each line is taken; if it mentions
    ``srd.yahoo.com``, the part after the first ``*`` is written to ``out``
    (the whole value when there is no ``*``).

    page_list -- the lines of a result page.
    out       -- writable file-like object; defaults to the module-level
                 ``links`` file.
    """
    if out is None:
        out = links
    in_results = False
    for ln in page_list:
        if RESULTS_HEADER in ln:
            in_results = True
        if not in_results:
            continue
        href_at = ln.find(HREF_MARK)
        if href_at == -1:
            continue
        value_start = href_at + len(HREF_MARK)
        # The value ends at the *next* quote.  The original used the last
        # quote on the line, which swallowed any following attributes.
        value_end = ln.find('"', value_start)
        if value_end == -1:
            continue
        reference = ln[value_start:value_end]
        if 'srd.yahoo.com' in reference:
            out.write(reference[reference.find('*') + 1:] + '\n')


class tomParser(htmllib.HTMLParser):
    """HTML parser that prints anchors, body text and title text.

    Attributes:
    in_anchor, headflag, bodyflag, titleflag -- 1 while the parser is inside
        the corresponding element, else 0.
    link_list -- ``href`` values of the anchors seen so far.
    """

    def __init__(self, formatter):
        htmllib.HTMLParser.__init__(self, formatter)
        # flag to determine if we are in an anchor tag
        self.in_anchor = 0
        self.headflag = 0
        self.bodyflag = 0
        self.titleflag = 0
        self.link_list = []

    def start_head(self, attrs):
        self.headflag = 1

    def end_head(self):
        self.headflag = 0

    def start_body(self, attrs):
        # The original also contained the incomplete statement
        # ``self.link_list attrs`` here, a syntax error.  Links are
        # collected in start_a instead, where anchor attributes are seen.
        self.bodyflag = 1

    def end_body(self):
        self.bodyflag = 0

    def start_title(self, attrs):
        self.titleflag = 1

    def end_title(self):
        self.titleflag = 0

    def start_a(self, attrs):
        """Mark the start of an anchor, record its href and print its attrs."""
        self.in_anchor = 1
        for name, value in attrs:
            if name == 'href':
                self.link_list.append(value)
        # Two spaces: same output as the original ``print 'Anchor: ',attrs``.
        print('Anchor:  %s' % (attrs,))

    def end_a(self):
        """Mark the end of an anchor."""
        self.in_anchor = 0

    def handle_data(self, text):
        """Print text data (i.e. not tags) according to the current flags."""
        if self.in_anchor:
            print(text)
        if self.bodyflag:
            stripped = text.strip()
            # Skip whitespace-only text and a lone chr(160).  The original
            # test (``<> '/n' or '' or chr(160)``) was always true.
            if stripped and stripped != chr(160):
                print(stripped)
                if len(stripped) == 1:
                    print(ord(stripped))
        if self.titleflag:
            # Two spaces: same output as ``print 'Title: ' , text``.
            print('Title:  %s' % text)


def _line_after_header(page_list):
    """Return the line following the last results header, or ''."""
    found = ''
    previous_was_header = False
    for ln in page_list:
        is_header = RESULTS_HEADER in ln
        if previous_was_header and not is_header:
            found = ln
        previous_was_header = is_header
    return found


def _snippets_around(page_text, term):
    """Return, for each occurrence of term, the text between tags around it.

    Each snippet runs from just after the closest preceding '>' (or the
    start of the text) up to the next '<' (or the end of the text).
    """
    snippets = []
    if not term:
        return snippets
    at = page_text.find(term)
    while at != -1:
        begin = page_text.rfind('>', 0, at) + 1
        stop = page_text.find('<', at)
        if stop == -1:
            stop = len(page_text)
        snippets.append(page_text[begin:stop])
        at = page_text.find(term, at + len(term))
    return snippets


page_format = formatter.NullFormatter()
page_parser = tomParser(page_format)

link_number_line = _line_after_header(page_list)
print(link_number_line)
# NOTE(review): every digit on the line is concatenated, so a line such as
# "1-20 of 345" would yield 120345 -- confirm against the real page layout.
link_number_string = ''.join(
    [ch for ch in link_number_line if ch in string.digits])
if link_number_string:
    total_links = int(link_number_string)
else:
    print('Error: no match count found on ' + page_URL)
    total_links = 0

# 'w+' starts from an empty file that can be read back after seek(0);
# with 'r+' stale content beyond the newly written links was read as URLs.
links = open('c:\\links.txt', 'w+')
try:
    links2 = open('c:\\links2.txt', 'w')
    try:
        # Offsets start at 1, so the upper bound must include total_links
        # itself, otherwise the last page is skipped when it holds one hit.
        for c in range(1, total_links + 1, 20):
            page_URL = SEARCH_URL + query_string + '&b=' + str(c)
            try:
                search_page = urllib.urlopen(page_URL)
            except IOError:
                print('Error: ' + page_URL)
                continue  # do not re-read the previous, closed page
            page_list = search_page.readlines()
            search_page.close()
            write_links(page_list)

        links.seek(0)
        # split() copes with a single-term query, where find('+') is -1.
        first_term = query_string.split('+')[0]
        for stored_line in links.readlines():
            page_URL = stored_line.strip()  # drop the trailing newline
            if not page_URL:
                continue
            print(page_URL)
            try:
                search_page = urllib.urlopen(page_URL)
            except IOError:
                print('Error: ' + page_URL)
                continue
            try:
                page_text = search_page.read()
            finally:
                search_page.close()
            page_parser.feed(page_text)
            for info_line in _snippets_around(page_text, first_term):
                links2.write(info_line)
    finally:
        links2.close()
finally:
    links.close()