내가 치료법을 사용하여 웹 사이트에서 필요로하는 데이터를 얻으려면 먼저 응답 객체를 만들어야합니다. 그래서 HtmlXpathSelector를 사용할 수 있습니다. HtmlXpathSelector는 URL 문자열을 인수로 허용하지 않습니다. 다음 코드 샘플에서는 "response2"변수가 비어 있기 때문에이를 수행하는 방법을 모른다.url 문자열에서 scrapy.http.Response를 인스턴스화하는 방법은 무엇입니까?
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from tarantula.items import OlgaItem
class OlgaSpider(CrawlSpider):
"""This crawler gets the physician's name and his homepage url."""
name = 'Olga'
DOWNLOAD_DELAY = 6 #para tentar evitar ser banido
#ROBOTSTXT_OBEY = True
#CONCURRENT_REQUESTS = 1
FEED_URI = '/home/mercutio22/gitcode/MedicWebsites.csv'
FEED_FORMAT = 'csv'
USER_AGENT = "Googlebot/2.1 (http://www.google.com/bot.html)"
#allowed_domains = ['guiareunimedicos.med.br']
start_urls = (
'http://medial-saude.guiareunimedicos.med.br/index.pl?act=searc\
h&_id_=172&_ev_=Submit&_formSearchSubmit=%3Adefault%3A&type=0&country=0\
&q=oncologia#results/',
'http://www.guiareunimedicos.med.br/index.pl?act=search&_id_=17\
#2&_ev_=Submit&_formSearchSubmit=%3Adefault%3A&type=0&country=0&q=cancer\
#ologia#results/')
rules = (
Rule(SgmlLinkExtractor(allow=r"V=", restrict_xpaths='//a[text()=">"]'),
callback='parse_item', follow=True),
)
def parse_item(self, response):
hxs = HtmlXPathSelector(response)
mdata = hxs.select('//div[contains(@class, "mdata")]')
links = mdata.select('./a/@href').extract()
names = mdata.select('./a/text()').extract()
items = []
for index in range(len(names)):
i = OlgaItem()
i['name'] = names[index]
i['link'] = links[index]
response2 =
hxs2 = HtmlXPathSelector(response2) ###
name = hxs2.select('//big/text()').extract()
i['clinics'] = hxs2.select('//h2/a/text()').extract()
data = hxs2.select('//div[contains(@class, "stab data")]')
addresses = [ x.select('./p/text()').extract() for x in data ]
addresses = [ ''.join(x) for x in addresses ]
addresses = [ x.replace('Telefone(s): \r\n\r\n\r\n', '') for x in addresses ]
addresses = [ x[2:] for x in addresses ]
i['addresses'] = addresses
i['phones'] = hxs2.select('//span[@id]/text()').extract()
items.append(i)
return items