
I'm trying to pull images from a website's detail pages. To get the links I'm using the RSS 'links' field. This is my code; when I try to fetch the detail pages for the images, I get an error.

@app.task
def pan_task():
    url = 'http://feeds.example.com/reuters/technologyNews'
    name = 'noticiassin'
    live_leaks = [i for i in feedparser.parse(url).entries][:10]
    the_count = len(live_leaks)
    ky = feedparser.parse(url).keys()
    oky = [i.keys() for i in feedparser.parse(url).entries][1]  # shows what I can pull

    def make_soup(url):
        def swappo():
            user_one = ' "Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0" '
            user_two = ' "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5)" '
            user_thr = ' "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko" '
            user_for = ' "Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0" '

            agent_list = [user_one, user_two, user_thr, user_for]
            a = random.choice(agent_list)
            return a

        headers = {
            "user-agent": swappo(),
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "accept-charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
            "accept-encoding": "gzip,deflate,sdch",
            "accept-language": "en-US,en;q=0.8",
        }
        the_comments_page = requests.get(url, headers=headers)
        soupdata = BeautifulSoup(the_comments_page.text, 'html5lib')
        # comment = soupdata.find('a').get('src')
        # para = comment.find_all('p')
        # kids = [child.text for child in para]
        # blu = str(kids).strip('[]')
        return soupdata

    try:
        live_entries = [{'href': live_leak.links[0]['href']} for live_leak in live_leaks]
        o = make_soup(live_entries)
    except IndexError:
        print('error check logs')
        live_entries = []

    return print(o)

But when I run it, it doesn't work. Why do I get this error:

[2016-10-07 21:10:58,019: ERROR/MainProcess] Task blog.tasks.pan_task[f43ed360-c06e-4a4b-95ab-4f44a4564afa] raised unexpected: InvalidSchema("No connection adapters were found for '[{'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/AA1uAIpygjQ/us-apple-samsung-elec-appeal-idUSKCN1271LF'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/Nz28cqiuS0Y/us-google-pixel-advertising-idUSKCN12721U'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/POLoFj22hc4/us-yahoo-nsa-order-idUSKCN12800D'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/eF-XlhlQl-s/us-fcc-dataservices-idUSKCN1271RB'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/hNf9IQ3rXjw/us-autonomous-nauto-idUSKCN1271FX'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/NXkk5WfWVhM/us-sony-sensors-idUSKCN1270EC'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/gdBvoarqQro/us-yahoo-discrimination-lawsuit-idUSKCN12800K'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/nt8K--27bDg/us-thomsonreuters-ceo-idUSKCN1271DQ'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/f8z3eQg2Fpo/us-snapchat-ipo-idUSKCN12627S'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/rr4vdLsC11Y/us-samsung-elec-results-idUSKCN1262NO'}]'",) 
Traceback (most recent call last): 
    File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/celery/app/trace.py", line 240, in trace_task 
    R = retval = fun(*args, **kwargs) 
    File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/celery/app/trace.py", line 438, in __protected_call__ 
    return self.run(*args, **kwargs) 
    File "/Users/ray/Desktop/myheroku/practice/src/blog/tasks.py", line 134, in pan_task 
    o = make_soup(live_entries) 
    File "/Users/ray/Desktop/myheroku/practice/src/blog/tasks.py", line 124, in make_soup 
    the_comments_page = requests.get(url, headers=headers) 
    File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/requests/api.py", line 67, in get 
    return request('get', url, params=params, **kwargs) 
    File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/requests/api.py", line 53, in request 
    return session.request(method=method, url=url, **kwargs) 
    File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/requests/sessions.py", line 468, in request 
    resp = self.send(prep, **send_kwargs) 
    File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/requests/sessions.py", line 570, in send 
    adapter = self.get_adapter(url=request.url) 
    File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/requests/sessions.py", line 644, in get_adapter 
    raise InvalidSchema("No connection adapters were found for '%s'" % url) 
requests.exceptions.InvalidSchema: No connection adapters were found for '[{'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/AA1uAIpygjQ/us-apple-samsung-elec-appeal-idUSKCN1271LF'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/Nz28cqiuS0Y/us-google-pixel-advertising-idUSKCN12721U'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/POLoFj22hc4/us-yahoo-nsa-order-idUSKCN12800D'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/eF-XlhlQl-s/us-fcc-dataservices-idUSKCN1271RB'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/hNf9IQ3rXjw/us-autonomous-nauto-idUSKCN1271FX'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/NXkk5WfWVhM/us-sony-sensors-idUSKCN1270EC'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/gdBvoarqQro/us-yahoo-discrimination-lawsuit-idUSKCN12800K'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/nt8K--27bDg/us-thomsonreuters-ceo-idUSKCN1271DQ'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/f8z3eQg2Fpo/us-snapchat-ipo-idUSKCN12627S'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/rr4vdLsC11Y/us-samsung-elec-results-idUSKCN1262NO'}]' 

I use this same function in another program.


requests takes a single URL, but you are passing it a list of dicts. – miah
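In other words, requests.get() expects a single URL string; handing it the whole live_entries list is exactly what raises InvalidSchema. A minimal sketch of the distinction, reusing the live_entries list from the question:

# requests.get(live_entries)  # fails: a list of dicts is not a URL
for entry in live_entries:
    page = requests.get(entry['href'])  # one URL string per request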


I used this function in a similar program. The only difference is that there I wasn't using feedparser to get the URLs. How can I make it work? – losee

Answer


You need to do something like this:

@app.task
def pan_task():
    url = 'http://feeds.example.com/reuters/technologyNews'
    name = 'noticiassin'
    live_leaks = [i for i in feedparser.parse(url).entries][:10]
    the_count = len(live_leaks)
    ky = feedparser.parse(url).keys()
    oky = [i.keys() for i in feedparser.parse(url).entries][1]  # shows what I can pull

    def make_soup(url):
        def swappo():
            user_one = ' "Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0" '
            user_two = ' "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5)" '
            user_thr = ' "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko" '
            user_for = ' "Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0" '

            agent_list = [user_one, user_two, user_thr, user_for]
            a = random.choice(agent_list)
            return a

        headers = {
            "user-agent": swappo(),
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "accept-charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
            "accept-encoding": "gzip,deflate,sdch",
            "accept-language": "en-US,en;q=0.8",
        }
        the_comments_page = requests.get(url, headers=headers)
        soupdata = BeautifulSoup(the_comments_page.text, 'html5lib')
        # comment = soupdata.find('div')
        # para = comment.find_all('p')
        # kids = [child.text for child in para]
        # blu = str(kids).strip('[]')
        return soupdata

    live_entries = []
    try:
        for live_leak in live_leaks:
            live_entries.append(make_soup(live_leak.links[0]['href']))
            # Do whatever you need to do with each soup here
    except IndexError:
        print('error check logs')
        live_entries = []
    return live_entries

It just returns an empty list [] – losee


What do you want it to return? – miah


I want to grab the src from the detail pages so I can use them. – losee
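For what it's worth, here is a minimal sketch of pulling image src values out of each detail-page soup that the answer's loop collects into live_entries. It assumes the detail pages embed their images in ordinary <img> tags, and img_urls is just an illustrative name:

img_urls = []
for soup in live_entries:  # each item is a BeautifulSoup object returned by make_soup()
    for img in soup.find_all('img'):
        src = img.get('src')
        if src:  # skip <img> tags without a src attribute
            img_urls.append(src)
print(img_urls)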