Python HTTPConnectionPool failed to establish a new connection: [Errno 11004] getaddrinfo failed
I wonder whether my requests are being blocked by the website and whether I need to set up a proxy. I first tried closing the HTTP connection, which failed. I also tried testing my code again, but now there seems to be no output at all. If I use a proxy, will everything be OK? Here is the code:
import requests
from urllib.parse import urlencode
import json
from bs4 import BeautifulSoup
import re
from html.parser import HTMLParser
from multiprocessing import Pool
from requests.exceptions import RequestException
import time
def get_page_index(offset, keyword):
    # headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'}
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 1
    }
    url = 'http://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        response = requests.get(url, headers={'Connection': 'close'})
        response.encoding = 'utf-8'
        if response.status_code == 200:
            return response.text
        return None
    except RequestException as e:
        print(e)
def parse_page_index(html):
    data = json.loads(html)
    if data and 'data' in data.keys():
        for item in data.get('data'):
            url = item.get('article_url')
            if url and len(url) < 100:
                yield url
def get_page_detail(url):
    try:
        response = requests.get(url, headers={'Connection': 'close'})
        response.encoding = 'utf-8'
        if response.status_code == 200:
            return response.text
        return None
    except RequestException as e:
        print(e)
def parse_page_detail(html):
    soup = BeautifulSoup(html, 'lxml')
    title = soup.select('title')[0].get_text()
    pattern = re.compile(r'articleInfo: (.*?)},', re.S)
    pattern_abstract = re.compile(r'abstract: (.*?)\.', re.S)
    res = re.search(pattern, html)
    res_abstract = re.search(pattern_abstract, html)
    if res and res_abstract:
        data = res.group(1).replace(r".replace(/<br \/>|\n|\r/ig, '')", "") + '}'
        abstract = res_abstract.group(1).replace(r"'", "")
        content = re.search(r'content: (.*?),', data).group(1)
        source = re.search(r'source: (.*?),', data).group(1)
        time_pattern = re.compile(r'time: (.*?)}', re.S)
        date = re.search(time_pattern, data).group(1)
        date_today = time.strftime('%Y-%m-%d')
        img = re.findall(r'src="(.*?)"', content)
        if date[1:11] == date_today and len(content) > 50 and img:
            return {
                'title': title,
                'content': content,
                'source': source,
                'date': date,
                'abstract': abstract,
                'img': img[0]
            }
def main(offset):
    flag = 1
    html = get_page_index(offset, '光伏')
    for url in parse_page_index(html):
        html = get_page_detail(url)
        if html:
            data = parse_page_detail(html)
            if data:
                html_parser = HTMLParser()
                cwl = html_parser.unescape(data.get('content'))
                data['content'] = cwl
                print(data)
                print(data.get('img'))
                flag += 1
                if flag == 5:
                    break
if __name__ == '__main__':
    pool = Pool()
    pool.map(main, [i*20 for i in range(10)])
Here is the error:
HTTPConnectionPool(host='tech.jinghua.cn', port=80): Max retries exceeded with url: /zixun/20160720/f191549.shtml (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x00000000048523C8>: Failed to establish a new connection: [Errno 11004] getaddrinfo failed',))
By the way, when I first tested the code it indicated that everything was fine! Thanks in advance.
I tested the code again. I do get output, but the run aborts as soon as the HTTPConnectionPool error occurs. How can I keep it from aborting? Thanks. – cwl
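To show what I am considering, here is a rough sketch of how I might skip a failing URL instead of letting the whole run abort, and how I might pass a proxy to requests.get. The function name get_page_detail_safe is just my placeholder, and the proxy address is a dummy value, not a real server I have:

# Rough sketch: skip URLs whose host cannot be reached instead of aborting.
# The proxy address below is only a placeholder.
import requests
from requests.exceptions import RequestException

PROXIES = {
    'http': 'http://127.0.0.1:8080',   # placeholder proxy
    'https': 'http://127.0.0.1:8080',  # placeholder proxy
}

def get_page_detail_safe(url, use_proxy=False):
    """Fetch a detail page; return None and keep going on any request error."""
    try:
        response = requests.get(
            url,
            headers={'Connection': 'close'},
            proxies=PROXIES if use_proxy else None,
            timeout=10,
        )
        response.encoding = 'utf-8'
        if response.status_code == 200:
            return response.text
        return None
    except RequestException as e:
        # e.g. the getaddrinfo failure for a host like tech.jinghua.cn
        print('skipping %s: %s' % (url, e))
        return None

The idea would be for main() to call get_page_detail_safe instead of get_page_detail, so that one unreachable host no longer stops the whole worker. Is this a reasonable approach, or do I really need a proxy?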