web scraping - Scrapy: how to return a new request after request.meta['proxy'] is changed?


I'm crawling pages through a few proxies, but some of the proxies don't work and I get responses like:

DEBUG: Crawled (403) <GET http://xiyuanxiaoqu0571.fang.com/xiangqing/>

or

DEBUG: Crawled (302) <GET http://yilexincun.fang.com/xiangqing/>

Below is my spider.py. In parse_community() I try to re-crawl any page whose response.status is not 200, but it doesn't seem to work.

Any help is appreciated!

Besides, how can I exclude the bad proxy IPs that caused the HTTP 302/403 responses while the crawler is running?

#-*- coding=utf8 -*-
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from soufang.items import CommunityInfo
from scrapy import log
import sys
import random
from soufang.misc.proxy import proxies
from imp import reload

reload(sys)
sys.setdefaultencoding("utf-8")


class SoufangSpider(CrawlSpider):
    name = 'soufang'
    allowed_domains = ['fang.com']
    start_urls = ['http://esf.hz.fang.com/housing/151_2352_1_0_0_0_1_0_0/']

    rules = (
        Rule(LinkExtractor(allow=('/xiangqing/$'), deny=('/\d.+\.html')), callback='parse_community'),
        Rule(LinkExtractor(allow=('/xiangqing/$'), deny=('/\d.+\.html'),
                           restrict_xpaths=(u"//div[@class='info rel floatl ml15']/dl/dd[@id='detail_6']")), follow=True),
        Rule(LinkExtractor(deny=('/\d.+\.html'), restrict_xpaths=u"//a[text()='下一页']"))
    )

    handle_httpstatus_list = [302, 404, 503, 403]

    def parse_community(self, response):
        if response.status != 200:
            print response.url
            request = scrapy.Request(response.url)
            p = random.choice(proxies)
            request.meta['proxy'] = "http://%s" % p
            return request

        item = CommunityInfo()

        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/div[@class='ewmboxtitle']/span[@class='floatl']/text()").extract()
        item['community'] = temp[0] if temp else ''

        item['city'] = ''

        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='开 发 商:']/../text()").extract()
        item['developer'] = temp[0] if temp else ''

        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='所属区域:']/../text()").extract()
        item['district'] = temp[0] if temp else ''

        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='小区地址:']/../text()").extract()
        item['address'] = temp[0] if temp else ''

        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='邮\xa0\xa0\xa0\xa0编:']/../text()").extract()
        item['postcode'] = temp[0] if temp else ''

        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='竣工时间:']/../text()").extract()
        item['yearofdev'] = temp[0] if temp else ''

        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='firstpic']/dd[text()='本月均价:']/span[1]/text()").extract()
        item['price'] = temp[0] if temp else ''

        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='总 户 数:']/../text()").extract()
        item['household_no'] = temp[0] if temp else ''

        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='物业类别:']/../text()").extract()
        item['community_type'] = temp[0] if temp else ''

        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='物 业 费:']/../text()").extract()
        item['property_fee'] = temp[0] if temp else ''

        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='建筑面积:']/../text()").extract()
        item['total_area'] = temp[0] if temp else ''

        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='占地面积:']/../text()").extract()
        item['area'] = temp[0] if temp else ''

        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='绿 化 率:']/../text()").extract()
        item['greening_rate'] = temp[0] if temp else ''

        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='容 积 率:']/../text()").extract()
        item['volumn_rate'] = temp[0] if temp else ''

        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/div[@class='yihang']/h3[text()='交通状况']/../following-sibling::dl[1]/dt[1]/text()").extract()
        item['transportation'] = temp[0] if temp else ''

        temp = "".join(response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/div[@class='yihang']/h3[text()='周边信息']/../following-sibling::dl[1]//text()").extract())
        item['periphery'] = temp if temp else ''

        log.msg(':'.join([response.url, item['community']]), level=log.INFO)

        return item

