0
안녕하세요. 다음 코드를 사용하여 digg.com 첫 페이지의 이미지를 스크래핑하고 있습니다. 문제는 0.jpg ~ 6.jpg는 정상인데, 7.jpg부터 47.jpg까지는 손상되어 있다는 것입니다. 스크래핑한 이미지가 왜 손상되는지 이유를 모르겠습니다.
코드는 다음과 같습니다. GitHub에서도 볼 수 있습니다: https://github.com/kenpeter/py_mm
# os
import os
# http request
import requests
#
import pprint
import time
# import html from lxml
from lxml import html
# Module-level state.
# Page number placeholder, initialised to 0; no active code visible here reads it.
global_page_num = 0
# Pretty-printer with a 4-space indent; no active code visible here calls it.
pp = pprint.PrettyPrinter(indent=4)
# write to file
def download_image(img_urls, timeout=30):
    """Download every URL in ``img_urls`` into ``img/<index>.jpg``.

    Files are numbered by each URL's position in ``img_urls``, starting at 0.
    A response with an HTTP error status is skipped instead of being written,
    so an error body is never saved under a ``.jpg`` name.

    Args:
        img_urls: Sequence of image URLs to fetch.
        timeout: Timeout in seconds passed to each ``requests.get`` call.

    Returns:
        None.
    """
    if not img_urls:
        return

    amount = len(img_urls)
    # Every file lands in the same directory, so create it once up front.
    os.makedirs('img', exist_ok=True)

    for index, value in enumerate(img_urls):
        filename = 'img/%s.jpg' % index
        print('--- start ---')
        print('filename: %s' % filename)
        print('Downloading: %s out of %s' % (index, amount))

        # Fetch before opening the file so a failed request does not leave
        # an empty or truncated file behind.
        response = requests.get(value, timeout=timeout)
        if not response.ok:
            print('Skipping %s: HTTP status %s' % (value, response.status_code))
            continue

        with open(filename, 'wb') as f:
            f.write(response.content)
def get_page_number(num):
    """Scrape the digg.com front page and download its story thumbnails.

    Args:
        num: Requested page number.
            NOTE(review): currently unused; the same front-page URL is
            always fetched. Confirm whether paging was intended.

    Returns:
        The list of thumbnail ``src`` values matched by the XPath query.
    """
    url = 'http://digg.com'
    page_bytes = requests.get(url).content
    selector = html.fromstring(page_bytes)
    # Thumbnail <img> tags sit inside an <a> within the story image container.
    img_urls = selector.xpath(
        "//div[@class='digg-story__image--thumb']/a/img/@src"
    )
    download_image(img_urls)
    return img_urls
if __name__ == '__main__':
    # The page number is hard-coded for now; prompting the user for it
    # (and storing it in global_page_num) is not enabled.
    page_number = 4
    get_page_number(num=page_number)