Scrapy: how to return a new request after request.meta['proxy'] is changed?
I'm crawling pages through a few proxies, but some of the proxies don't work and I get responses like:
DEBUG: Crawled (403) <GET http://xiyuanxiaoqu0571.fang.com/xiangqing/>
or
DEBUG: Crawled (302) <GET http://yilexincun.fang.com/xiangqing/>
In parse_community() of my spider (full spider.py at the end of this post) I tried to re-crawl the pages whose response.status is not 200, but it doesn't seem to work; for easier reference the retry fragment is repeated again after the full listing. Any help is appreciated!
Besides, how can I exclude the bad proxy IPs that cause these HTTP 302/403 responses while the crawler is running? Roughly, I imagine something like the downloader-middleware sketch below, but I don't know whether that is the right approach.
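This is only an untested sketch of the idea, not working code: BadProxyMiddleware is a name I made up, and it assumes the same proxies list of 'ip:port' strings that soufang.misc.proxy already exposes to the spider.

# soufang/middlewares.py -- untested sketch, not part of the project yet
from soufang.misc.proxy import proxies  # the same ip:port list the spider draws from

class BadProxyMiddleware(object):
    """Drop a proxy from the shared pool as soon as it answers with 302/403."""

    def process_response(self, request, response, spider):
        if response.status in (302, 403):
            bad = request.meta.get('proxy', '').replace('http://', '')
            if bad in proxies:
                proxies.remove(bad)  # never pick this ip:port again
        return response

# settings.py -- also part of the sketch:
# DOWNLOADER_MIDDLEWARES = {'soufang.middlewares.BadProxyMiddleware': 543}

Here is the full spider.py: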
#-*- coding=utf8 -*-
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from soufang.items import community_info
from scrapy import log
import sys
import random
from soufang.misc.proxy import proxies
from imp import reload

reload(sys)
sys.setdefaultencoding("utf-8")

class SoufangSpider(CrawlSpider):

    name = 'soufang'
    allowed_domains = ['fang.com']
    start_urls = ['http://esf.hz.fang.com/housing/151_2352_1_0_0_0_1_0_0/']

    rules = (
        Rule(LinkExtractor(allow=('/xiangqing/$'), deny=('/\d.+\.html')),
             callback='parse_community'),
        Rule(LinkExtractor(allow=('/xiangqing/$'), deny=('/\d.+\.html'),
                           restrict_xpaths=(u"//div[@class='info rel floatl ml15']/dl/dd[@id='detail_6']")),
             follow=True),
        Rule(LinkExtractor(deny=('/\d.+\.html'),
                           restrict_xpaths=u"//a[text()='下一页']")),
    )

    handle_httpstatus_list = [302, 404, 503, 403]

    def parse_community(self, response):
        if response.status != 200:
            print response.url
            request = scrapy.Request(response.url)
            p = random.choice(proxies)
            request.meta['proxy'] = "http://%s" % p
            return request

        item = community_info()

        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/div[@class='ewmboxtitle']/span[@class='floatl']/text()").extract()
        item['community'] = temp[0] if temp else ''
        item['city'] = ''
        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='开 发 商:']/../text()").extract()
        item['developer'] = temp[0] if temp else ''
        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='所属区域:']/../text()").extract()
        item['district'] = temp[0] if temp else ''
        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='小区地址:']/../text()").extract()
        item['address'] = temp[0] if temp else ''
        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='邮\u00a0\u00a0\u00a0\u00a0编:']/../text()").extract()
        item['postcode'] = temp[0] if temp else ''
        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='竣工时间:']/../text()").extract()
        item['yearofdev'] = temp[0] if temp else ''
        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='firstpic']/dd[text()='本月均价:']/span[1]/text()").extract()
        item['price'] = temp[0] if temp else ''
        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='总 户 数:']/../text()").extract()
        item['household_no'] = temp[0] if temp else ''
        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='物业类别:']/../text()").extract()
        item['community_type'] = temp[0] if temp else ''
        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='物 业 费:']/../text()").extract()
        item['property_fee'] = temp[0] if temp else ''
        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='建筑面积:']/../text()").extract()
        item['total_area'] = temp[0] if temp else ''
        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='占地面积:']/../text()").extract()
        item['area'] = temp[0] if temp else ''
        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='绿 化 率:']/../text()").extract()
        item['greening_rate'] = temp[0] if temp else ''
        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/dl[@class='lbox']/dd/strong[text()='容 积 率:']/../text()").extract()
        item['volumn_rate'] = temp[0] if temp else ''
        temp = response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/div[@class='yihang']/h3[text()='交通状况']/../following-sibling::dl[1]/dt[1]/text()").extract()
        item['transportation'] = temp[0] if temp else ''
        temp = "".join(response.xpath(u"//div[@class='maininfo']/div[@class='leftinfo']/div[@class='yihang']/h3[text()='周边信息']/../following-sibling::dl[1]//text()").extract())
        item['periphery'] = temp if temp else ''

        log.msg(':'.join([response.url, item['community']]), level=log.INFO)
        return item
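For reference, the part of parse_community() that is supposed to do the re-crawl is just this fragment (repeated verbatim from the listing above; the rest of the method is plain XPath extraction):

if response.status != 200:
    print response.url
    request = scrapy.Request(response.url)
    p = random.choice(proxies)
    request.meta['proxy'] = "http://%s" % p
    return request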