#主要是爬取后给别人做自然语言分析,没其他意思。 #coding=utf8import requests,refrom lxml import etreeimport sysreload(sys)sys.setdefaultencoding('utf8')sys.setrecursionlimit(10000000) #解决maximum recursion depth exceeded in cmpdef craw(url): headerx={ 'Cookie':'bid=OIBtzThxxA; ct=y; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1502186407%2C%22http%3A%2F%2Fqianxun.baidu.com%2Fmovie%2Fcard_2162.html%22%5D; __utmt=1; ps=y; dbcl2="165xxx93:UV/wbzXasBQ"; ck=d-ep; _pk_id.100001.4cf6=7bff167cf6dxxxxxx.10.1502186411; __utmc=30149280; __utmz=30149280.1501649654.7.7.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utma=22369xxxxx59553.1502186411.2; __utmb=223695111.48.10.1502186411; __utmc=223695111; __utmz=223695111.1500959553.1.1.utmcsr=qianxun.baidu.com|utmccn=(referral)|utmcxxxxx.html; push_noty_num=0; push_doumail_num=0; ap=1' } while 1: try: resp=requests.get(url,headers=headerx) if resp.status_code==200: flag=1 break else: pass except Exception,e: print e selector=etree.HTML(resp.content.decode('utf8')) #print resp.content.decode('utf8') all_comment=selector.xpath('//div[@class="comment-item"]') for comment in all_comment: #print etree.tounicode(comment),'************************' star_class=comment.xpath('.//span[contains(@class,"allstar")]/@class') if star_class: starx=re.findall('tar(.*?)0',star_class[0])[0] else: starx=0 #有的评论没有打星 textx=comment.xpath('.//div[@class="comment"]/p/text()')[0] print starx,'星 ',textx f.write('%s星 %s\r\n'%(starx,textx)) next_start=re.search(u'前页[\s\S]*?
豆瓣影评每一页的链接无法预先推算:理想情况下,url 中的 start 参数从 0 开始,第二页是 20,第三页是 40,但实际并非如此,所以只能从当前页面中提取下一页的链接。脚本是单线程的,爬取函数每爬完一页就调用自身去爬下一页。结果每爬几十分钟就报错 maximum recursion depth exceeded in cmp,起初以为是偶然,但反复把脚本停掉再重启了好几次都是这样,于是去查了原因:上面的函数里有调用函数本身的代码,每爬一页递归就加深一层,而 Python 默认的递归深度上限是 1000(实际能递归的层数略少于此,大约九百多层),超过就会报这个错。在脚本开头调用 sys.setrecursionlimit(10000000) 把上限调大之后,果然好了。