python 网页抓取经验

来源:百度文库 编辑:神马文学网 时间:2024/04/28 08:25:12
一定要设置好 user-agent 和 accept 请求头
Python代码
  1. #coding:utf-8  
  2. '''   
  3. Created on 2009-7-15  
  4.   
  5. @author: Administrator   
  6. '''   
  7. import urllib2   
  8. import newhttplib   
  9. import lxml.html as x     
  10.   
  11. def getmusic(num,soc=None):   
  12.     s=num.split(':')   
  13.     s[0]=s[0].strip()   
  14.     s[1]=s[1].strip()   
  15.     s[2]=s[2].strip()   
  16.     h1 = newhttplib.HTTPConnection('10.0.0.172',80)   
  17.     hs={'Accept''text/html,application/xhtml+xml,application/xml','User-Agent''Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5'}   
  18.     h1.auto_open = 0  
  19.     h1.connect(soc)   
  20.     url='http://218.200.160.29/s3/i/app/search/musicResult.jsp?qd=1956&CH=12530-wap-lslb&v=1864&tplpath=/s3/i/qrc/&type=all&keyword='+urllib2.quote(s[0].encode('utf-8')+" "+s[1].encode('utf-8'))    
  21.     h1.request("GET",url,headers=hs)    
  22.     r1 = h1.getresponse()   
  23.     print r1.getheader('Content-Type')   
  24.     content=r1.read()      
  25.     #content=content.decode('utf-8')   
  26.     doc=x.document_fromstring(content)     
  27.     alist=doc.xpath("//a")   
  28.     print s[0],s[1],len(alist)   
  29.     for a in alist:   
  30.         c1= a.text_content()   
  31.         try:   
  32.             if c1.find(s[0])>=0 and c1.find(s[1])>=0:   
  33.                 print c1   
  34.                 print c1.find(s[0]),c1.find(s[1])   
  35.                 nexturl='http://218.200.160.29'+a.get('href')    
  36.                 print nexturl   
  37.                 h1.request("GET",nexturl,headers=hs)    
  38.                 r1 = h1.getresponse()   
  39.                 content=r1.read()   
  40.                 print r1.getheaders()   
  41.                 #content=content.decode('utf-8')   
  42.                 print content   
  43.                 f=open('./b.html','wb')   
  44.                 f.write(content)   
  45.                 f.write(nexturl)   
  46.                 f.close()   
  47.                 doc=x.document_fromstring(content)     
  48.                 blist=doc.xpath("//a")   
  49.                 print 'blen=',len(blist)   
  50.                 for b in blist:   
  51.                     bcontent=b.text_content()   
  52.                     if bcontent.find(u'高潮版')>=0:   
  53.                          print bcontent   
  54.                          bhref= b.get('href')   
  55.                          nnexturl='http://218.200.160.29'+bhref   
  56.                          h1.request("GET",nnexturl,headers=hs)    
  57.                          r1 = h1.getresponse()   
  58.                          ccontent=r1.read()   
  59.                          #content=content.decode('utf-8')   
  60.                          #print ccontent   
  61.                          doc=x.document_fromstring(ccontent)     
  62.                          clist=doc.xpath("//a")   
  63.                          print 'clen=',len(clist)   
  64.                          for c in clist:   
  65.                              chref=c.get('href')   
  66.                              if chref.find('218.200.160.10')>=0:   
  67.                                  print chref   
  68.                                  sb2=s[2][11:]   
  69.                                  print sb2   
  70.                                  if chref.find(sb2)>=0:   
  71.                                      print u'找到匹配歌曲开始下载。。。'  
  72.                                      h1.request("GET",chref)    
  73.                                      r1 = h1.getresponse()   
  74.                                      print r1.status   
  75.                                      print r1.getheaders()   
  76.         except BaseException,e:   
  77.             print e   
  78.                
  79. if __name__ == '__main__':   
  80.     #f=open('./test1.htm','wb')   
  81.     #con=f.read()     
  82.     #print con   
  83.     getmusic('相信:曾建军:600902000005714466')   
  84.