2017-04-21 4 views
0

안녕하세요. 다음 코드를 사용하여 digg.com 첫 페이지의 이미지를 스크랩하고 있습니다. 문제는 0.jpg ~ 6.jpg는 정상인데, 7.jpg부터 47.jpg까지는 손상되어 있다는 것입니다. 이유를 모르겠습니다. 스크랩한 이미지가 손상되었습니다.

코드는 다음과 같습니다. GitHub에서도 볼 수 있습니다: https://github.com/kenpeter/py_mm

# os 
import os 
# http request 
import requests 
# 
import pprint 

import time 

# import html from lxml 
from lxml import html 

# Module-level state.
# Page-number placeholder; in the visible code it is only assigned here
# (the lines that would update it are commented out).
global_page_num = 0 
# Shared pretty-printer, referenced only by commented-out debug output.
pp = pprint.PrettyPrinter(indent=4) 

# write to file 
def download_image(img_urls):
    """Download every URL in *img_urls* to ``img/<index>.jpg``.

    Files are numbered by the position of the URL in *img_urls*, starting
    at 0. A URL that cannot be fetched (network error, timeout or non-2xx
    status) is reported and skipped, so error responses are not saved as
    image files and no empty file is left behind.

    Args:
        img_urls: Sequence of image URLs to download.

    Returns:
        None.
    """
    # Nothing to do: avoid creating the output directory for no reason.
    if not img_urls:
        return

    amount = len(img_urls)

    # The target directory is the same for every file, so create it once.
    os.makedirs('img', exist_ok=True)

    for index, value in enumerate(img_urls):
        filename = 'img/%s.jpg' % index

        print('--- start ---')
        print('filename: %s' % filename)
        # Report a 1-based position so the last item reads "N out of N".
        print('Downloading: %s out of %s' % (index + 1, amount))

        # Fetch first and open the file only on success; otherwise a failed
        # request would leave a truncated or empty file on disk.
        try:
            response = requests.get(value, timeout=30)
            response.raise_for_status()
        except requests.RequestException as exc:
            print('Skipping %s: %s' % (value, exc))
            continue

        with open(filename, 'wb') as f:
            f.write(response.content)


def get_page_number(num):
    """Scrape the digg.com front page and download its story thumbnails.

    Args:
        num: Page number supplied by the caller. It is not used by this
            implementation, which always requests the front page.

    Returns:
        The list of values found in the ``src`` attribute of each story
        thumbnail image, in document order.
    """
    front_page = requests.get('http://digg.com')
    tree = html.fromstring(front_page.content)

    # Only the plain ``src`` attribute of each thumbnail is read here.
    thumb_srcs = tree.xpath(
        "//div[@class='digg-story__image--thumb']/a/img/@src")

    # Story descriptions are extracted but not used any further.
    descriptions = tree.xpath("//div[@itemprop='description']/text()")

    download_image(thumb_srcs)
    return thumb_srcs


if __name__ == '__main__': 
    # Script entry point. Interactive input is currently disabled; the
    # original prompt is kept below for reference.
    # page_number = input('Please enter the page number that you want to scrape:') 

    # Disabled: would copy the chosen page number into the module global.
    # global_page_num = page_number; 
    # print('hell world!'); 

    # Hard-coded page number handed to get_page_number.
    page_number = 4 # hardcode 
    get_page_number(page_number) 

답변

0

이미지가 "손상"되는 이유는 페이지 중간부터 방식이 바뀌어, 이미지 URL이 코드에서 내용을 가져오는 src 속성 대신 data-src 속성에 "숨겨지기" 시작하기 때문입니다. 두 속성을 모두 가진 예는 아래에 있는 캡처한 페이지의 소스 코드를 참조하십시오. 즉, 이미지 URL 목록을 만들 때 src와 data-src 두 속성을 모두 확인하되 data-src에 src보다 높은 우선순위를 주어야

<img 
class="digg-story__image-img js--digg-story__image-img lazy-image-img need-offset" 
data-src="http://static.digg.com/images/f0b92c2d8a2c4b7f829abbc0e58a408c_2oijd0Z_1_www_large_thumb.jpeg" 
src="http://static.digg.com/static/fe/944294/images/x_455x248.png" 
width="312" 
height="170" 
alt="" 
/> 

합니다.

다음 코드는 이 "트릭"을 적용하여 올바른 이미지를 다운로드합니다:

# os 
import os 
# http request 
import requests 
# 
import pprint 

import time 

# import html from lxml 
from lxml import html 

# Module-level state.
# Page-number placeholder; in the visible code it is only assigned here
# (the lines that would update it are commented out).
global_page_num = 0 
# Shared pretty-printer, referenced only by commented-out debug output.
pp = pprint.PrettyPrinter(indent=4) 

# write to file 
def download_image(img_urls):
    """Download every URL in *img_urls* to ``img/<index>.jpg``.

    Files are numbered by the position of the URL in *img_urls*, starting
    at 0. A URL that cannot be fetched (network error, timeout or non-2xx
    status) is reported and skipped, so error responses are not saved as
    image files and no empty file is left behind.

    Args:
        img_urls: Sequence of image URLs to download.

    Returns:
        None.
    """
    # Nothing to do: avoid creating the output directory for no reason.
    if not img_urls:
        return

    amount = len(img_urls)

    # The target directory is the same for every file, so create it once.
    os.makedirs('img', exist_ok=True)

    for index, value in enumerate(img_urls):
        filename = 'img/%s.jpg' % index

        print('--- start ---')
        print('filename: %s' % filename)
        # Report a 1-based position so the last item reads "N out of N".
        print('Downloading: %s out of %s' % (index + 1, amount))

        # Fetch first and open the file only on success; otherwise a failed
        # request would leave a truncated or empty file on disk.
        try:
            response = requests.get(value, timeout=30)
            response.raise_for_status()
        except requests.RequestException as exc:
            print('Skipping %s: %s' % (value, exc))
            continue

        with open(filename, 'wb') as f:
            f.write(response.content)


def get_page_number(num):
    """Scrape the digg.com front page and download its story thumbnails.

    Each thumbnail ``<img>`` contributes at most one URL: the ``data-src``
    attribute when present, otherwise ``src``. Choosing per element keeps
    the URLs in page order and avoids downloading an image twice when it
    carries both attributes.

    Args:
        num: Page number supplied by the caller. It is not used by this
            implementation, which always requests the front page.

    Returns:
        The list of thumbnail URLs, in document order.
    """
    # File name of the placeholder graphic that lazily loaded images carry
    # in ``src``; it is never a real thumbnail.
    placeholder_name = 'x_455x248.png'

    response = requests.get('http://digg.com', timeout=30)
    selector = html.fromstring(response.content)

    images = selector.xpath("//div[@class='digg-story__image--thumb']/a/img")

    img_urls = []
    for img in images:
        # Prefer the lazy-load attribute; fall back to the plain source.
        url = img.get('data-src') or img.get('src')
        if url and placeholder_name not in url:
            img_urls.append(url)

    download_image(img_urls)

    return img_urls


if __name__ == '__main__': 
    # Script entry point. Interactive input is currently disabled; the
    # original prompt is kept below for reference.
    # page_number = input('Please enter the page number that you want to scrape:') 

    # Disabled: would copy the chosen page number into the module global.
    # global_page_num = page_number; 
    # print('hell world!'); 

    # Hard-coded page number handed to get_page_number.
    page_number = 4 # hardcode 
    get_page_number(page_number)