I am not sure how to handle this situation. The script gets past almost all other broken links, but on this one, http://cutearoo.com/wp-content/uploads/2011/04/Pomsky.png, it gets stuck and I have to Ctrl+C it: downloading an image from a link that no longer exists simply hangs. Here is the code:
import datetime
import praw
import re
import urllib
import requests
from bs4 import BeautifulSoup

sub = 'dog'
imgurUrlPattern = re.compile(r'(http://i.imgur.com/(.*))(\?.*)?')
r = praw.Reddit(user_agent="download all images from a subreddit",
                user_site="lamiastella")
already_done = []
#checkWords = ['i.imgur.com', 'jpg', 'png',]
check_words = ['jpg', 'jpeg', 'png']

subreddit = r.get_subreddit(sub)
for submission in subreddit.get_top_from_all(limit=10000):
#for submission in subreddit.get_hot(limit=10000):
    is_image = any(string in submission.url for string in check_words)
    print '[LOG] Getting url: ' + submission.url
    if submission.id not in already_done and is_image:
        if submission.url.endswith('/'):
            modified_url = submission.url[:len(submission.url) - 1]
            try:
                urllib.urlretrieve(modified_url, '/home/jalal/computer_vision/image_retrieval/images/' + datetime.datetime.now().strftime('%y-%m-%d-%s') + modified_url[-5:])
            except Exception as e:
                print(e)
                #pass
                continue
        else:
            try:
                urllib.urlretrieve(submission.url, '/home/jalal/computer_vision/image_retrieval/images/' + datetime.datetime.now().strftime('%y-%m-%d-%s') + submission.url[-5:])
            except Exception as e:
                print(e)
                #pass
                continue

        already_done.append(submission.id)
        print '[LOG] Done Getting ' + submission.url
        print('{0}: {1}'.format('submission id is', submission.id))
    elif 'imgur.com' in submission.url and not (submission.url.endswith('gif')
                                                or submission.url.endswith('webm')
                                                or submission.url.endswith('mp4')
                                                or submission.url.endswith('all')
                                                or '#' in submission.url
                                                or '/a/' in submission.url):
        # This is an Imgur page with a single image.
        html_source = requests.get(submission.url).text  # download the image's page
        soup = BeautifulSoup(html_source, "lxml")
        image_url = soup.select('img')[0]['src']
        if image_url.startswith('//'):
            # if no schema is supplied in the url, prepend 'http:' to it
            image_url = 'http:' + image_url
        image_id = image_url[image_url.rfind('/') + 1:image_url.rfind('.')]
        urllib.urlretrieve(image_url, '/home/jalal/computer_vision/image_retrieval/images/' + 'imgur_' + datetime.datetime.now().strftime('%y-%m-%d-%s') + submission.url[-9:0])
    elif 'instagram.com' in submission.url:
        html_source = requests.get(submission.url).text
        soup = BeautifulSoup(html_source, "lxml")
        instagram_url = soup.find('meta', {"property": "og:image"})['content']
        urllib.urlretrieve(instagram_url, '/home/jalal/computer_vision/image_retrieval/images/' + 'instagram_' + datetime.datetime.now().strftime('%y-%m-%d-%s') + '.jpg')
    else:
        continue
The relevant part of the output is:
[LOG] Done Getting http://i.imgur.com/Vc9P9QC.jpg
submission id is: 1fv70j
[LOG] Getting url: http://i.imgur.com/iOBi0qx.jpg
[LOG] Done Getting http://i.imgur.com/iOBi0qx.jpg
submission id is: 1dof3o
[LOG] Getting url: http://cutearoo.com/wp-content/uploads/2011/04/Pomsky.png
^CTraceback (most recent call last):
File "download_images.py", line 35, in <module>
urllib.urlretrieve(submission.url, '/home/jalal/computer_vision/image_retrieval/images/' + datetime.datetime.now().strftime('%y-%m-%d-%s') + submission.url[-5:])
File "/usr/lib/python2.7/urllib.py", line 98, in urlretrieve
return opener.retrieve(url, filename, reporthook, data)
File "/usr/lib/python2.7/urllib.py", line 245, in retrieve
fp = self.open(url, data)
File "/usr/lib/python2.7/urllib.py", line 213, in open
return getattr(self, name)(url)
File "/usr/lib/python2.7/urllib.py", line 350, in open_http
h.endheaders(data)
File "/usr/lib/python2.7/httplib.py", line 1053, in endheaders
self._send_output(message_body)
File "/usr/lib/python2.7/httplib.py", line 897, in _send_output
self.send(msg)
File "/usr/lib/python2.7/httplib.py", line 859, in send
self.connect()
File "/usr/lib/python2.7/httplib.py", line 836, in connect
self.timeout, self.source_address)
File "/usr/lib/python2.7/socket.py", line 566, in create_connection
sock.connect(sa)
File "/usr/lib/python2.7/socket.py", line 228, in meth
return getattr(self._sock,name)(*args)
KeyboardInterrupt
Please suggest a fix.
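Since the traceback shows the hang happening inside socket.connect, one direction that may help is bounding every socket operation with a global default timeout, which urllib.urlretrieve should pick up through httplib. A minimal sketch of that idea, assuming Python 2 (the 10-second value and the download_image helper are only illustrative, not part of the script above):

import socket
import urllib

# Give up on any socket operation (connect or read) after 10 seconds;
# urllib.urlretrieve should pick this default up through httplib.
socket.setdefaulttimeout(10)

def download_image(url, destination):
    """Fetch one image, skipping dead hosts instead of hanging on them."""
    try:
        urllib.urlretrieve(url, destination)
        return True
    except (IOError, socket.error) as e:  # socket.timeout is a subclass of socket.error
        print '[LOG] Skipping ' + url + ' (' + str(e) + ')'
        return False

With a default timeout in place, the existing try/except blocks in the loop would catch the timeout error and continue past the dead link instead of blocking.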
UPDATE
I used something like:

import urllib2

image_file = urllib2.urlopen(modified_url)
with open('/home/jalal/computer_vision/image_retrieval/images/' + datetime.datetime.now().strftime('%y-%m-%d-%s') + modified_url[-5:], 'wb') as output_image:
    output_image.write(image_file.read())

and it is still stuck on this particular link.
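For this urllib2 variant, urllib2.urlopen also accepts a timeout argument (Python 2.6+), so each request can be bounded individually instead of relying on a global default. A sketch along those lines, with the 10-second value and the fetch_with_timeout name chosen here only for illustration:

import socket
import urllib2

def fetch_with_timeout(url, destination, timeout=10):
    """Download url into destination, giving up after `timeout` seconds per socket wait."""
    try:
        image_file = urllib2.urlopen(url, timeout=timeout)
        with open(destination, 'wb') as output_image:
            output_image.write(image_file.read())
        return True
    except (urllib2.URLError, socket.timeout) as e:
        # Connect timeouts usually surface as URLError, read timeouts as socket.timeout.
        print '[LOG] Skipping ' + url + ' (' + str(e) + ')'
        return False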
Possible duplicate of [Timeout a file download with Python urllib?](http://stackoverflow.com/questions/32763720/timeout-a-file-download-with-python-urllib) – zvone
@zvone Please check the update! –
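In the spirit of the linked question, the download could also go through requests, which takes an explicit timeout (it bounds the connect and per-read waits, not the total transfer time). A sketch, with the helper name and the 10-second value made up for illustration:

import requests

def download_with_requests(url, destination, timeout=10):
    """Stream url into destination, skipping URLs that stop responding for `timeout` seconds."""
    try:
        response = requests.get(url, stream=True, timeout=timeout)
        response.raise_for_status()  # treat 404s and other HTTP errors as failures too
        with open(destination, 'wb') as output_image:
            for chunk in response.iter_content(chunk_size=8192):
                output_image.write(chunk)
        return True
    except requests.exceptions.RequestException as e:
        print '[LOG] Skipping ' + url + ' (' + str(e) + ')'
        return False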