# Not very experienced with Python — gave it a try today.
#!/usr/bin/python# vim: set fileencoding=utf-8:import sysimport urllib2import reimport sqlite3import hashlibimport randomfrom BeautifulSoup import BeautifulSoupclass SpriderUrl: # 初始化 def __init__(self,url,domain_name): self.url=url self.domain_name=domain_name # 获得URL列表 def getUrl(self): urls=[] # try: body_text=urllib2.urlopen(self.url).read() soup=BeautifulSoup(body_text) links=soup.findAll('a') # connect sqllite3 md5_str=hashlib.md5(str(random.randint(1,100000))+"aa") print "data_name:"+md5_str.hexdigest() # create sqlite3 data name con=sqlite3.connect(md5_str.hexdigest()+".db") # create sqlite3 table name con.execute("""create table url_data(id interger auto_increment primary key,url TEXT not null)""") for link in links: if re.match('(.*)\:\/\/'+self.domain_name,link.get('href')): urls.append(link.get('href')) con.execute("insert into url_data(url)values('"+link.get('href')+"')") con.commit() while len(urls)>0: for url in urls: body_text2=urllib2.urlopen(url).read() soup2=BeautifulSoup(body_text2) links2=soup2.findAll('a') for link2 in links2: if re.match('(.*)\:\/\/'+self.domain_name,link2.get('href')): test=link2.get('href') cur=con.execute("select * from url_data where url='"+test+"'") bool_itm=cur.fetchone() if bool_itm is None: urls.append(link2.get('href')) con.execute("insert into url_data(url)values('"+test+"')") else: continue else: continue print "Done"t=SpriderUrl('http://www.baidu.com/',"www.baidu.com")t.getUrl()