2011-11-06 7 views
1

내가 치료법을 사용하여 웹 사이트에서 필요로하는 데이터를 얻으려면 먼저 응답 객체를 만들어야합니다. 그래서 HtmlXpathSelector를 사용할 수 있습니다. HtmlXpathSelector는 URL 문자열을 인수로 허용하지 않습니다. 다음 코드 샘플에서는 "response2"변수가 비어 있기 때문에이를 수행하는 방법을 모른다.url 문자열에서 scrapy.http.Response를 인스턴스화하는 방법은 무엇입니까?

from scrapy.selector import HtmlXPathSelector 
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 
from scrapy.contrib.spiders import CrawlSpider, Rule 
from tarantula.items import OlgaItem 

class OlgaSpider(CrawlSpider): 
    """This crawler gets the physician's name and his homepage url.""" 

    name = 'Olga' 
    DOWNLOAD_DELAY = 6 #para tentar evitar ser banido 
    #ROBOTSTXT_OBEY = True 
    #CONCURRENT_REQUESTS = 1 
    FEED_URI = '/home/mercutio22/gitcode/MedicWebsites.csv' 
    FEED_FORMAT = 'csv' 
    USER_AGENT = "Googlebot/2.1 (http://www.google.com/bot.html)" 
    #allowed_domains = ['guiareunimedicos.med.br'] 
    start_urls = (
    'http://medial-saude.guiareunimedicos.med.br/index.pl?act=searc\ 
    h&_id_=172&_ev_=Submit&_formSearchSubmit=%3Adefault%3A&type=0&country=0\ 
    &q=oncologia#results/', 
    'http://www.guiareunimedicos.med.br/index.pl?act=search&_id_=17\ 
    #2&_ev_=Submit&_formSearchSubmit=%3Adefault%3A&type=0&country=0&q=cancer\ 
    #ologia#results/') 
    rules = (
    Rule(SgmlLinkExtractor(allow=r"V=", restrict_xpaths='//a[text()=">"]'), 
    callback='parse_item', follow=True), 
    ) 

    def parse_item(self, response): 

     hxs = HtmlXPathSelector(response) 
     mdata = hxs.select('//div[contains(@class, "mdata")]') 
     links = mdata.select('./a/@href').extract() 
     names = mdata.select('./a/text()').extract() 

     items = [] 
     for index in range(len(names)): 
      i = OlgaItem() 
      i['name'] = names[index] 
      i['link'] = links[index] 
      response2 = 
      hxs2 = HtmlXPathSelector(response2) ### 
      name = hxs2.select('//big/text()').extract() 
      i['clinics'] = hxs2.select('//h2/a/text()').extract() 
      data = hxs2.select('//div[contains(@class, "stab data")]') 
      addresses = [ x.select('./p/text()').extract() for x in data ] 
      addresses = [ ''.join(x) for x in addresses ] 
      addresses = [ x.replace('Telefone(s): \r\n\r\n\r\n', '') for x in addresses ] 
      addresses = [ x[2:] for x in addresses ] 
      i['addresses'] = addresses 
      i['phones'] = hxs2.select('//span[@id]/text()').extract() 

      items.append(i) 
     return items 

답변

3
당신은 또한 그냥 HTML로 제공하여 HtmlXPathSelector를 만들 수 있습니다

: 예를 들어,

hxs = HtmlXPathSelector(text= '<div>blah-blah</div>')