이미지 집합이 있는데, 이미지 데이터를 해시하여 ID로 만들고 싶습니다. PIL/Pillow로 픽셀의 일부(하위 집합)만 읽을 수 있는 방법이 있습니까?
현재는 다음과 같이 하고 있습니다:
import hashlib
import uuid
def get_image_uuid(pil_img):
    """Build a deterministic UUID from the full pixel buffer of an image.

    Args:
        pil_img: object exposing ``tobytes()`` (a PIL/Pillow image in the
            surrounding text); every byte it returns is hashed.

    Returns:
        uuid.UUID: UUID made from the first 16 bytes of the SHA-1 digest.
    """
    # Dump the whole pixel buffer and hash it in one go.
    digest = hashlib.sha1(pil_img.tobytes()).digest()
    # SHA-1 emits 20 bytes while a UUID holds 16, so keep only the leading 16.
    return uuid.UUID(bytes=digest[:16])
이 방식은 이미지의 모든 픽셀 데이터를 읽는데, 결정적인 16바이트 UUID 해시를 만드는 데에는 지나칩니다.
다음과 같은 식으로 할 수 있는 방법이 있습니까?
# Wished-for call from the question: fetch only a strided subset of the pixel bytes.
# NOTE(review): `stride` does not appear to be a keyword accepted by Image.tobytes -- confirm against the Pillow docs.
img_bytes = pil_img.tobytes(stride=16)
편집: 아래 스크립트를 사용하여 좀 더 자세한 타이밍 결과를 얻었습니다. 사용하고 있는 이미지는 큽니다(약 6MB). 스크립트는 다음과 같습니다:
from __future__ import absolute_import, division, print_function
import __builtin__
import time
import timeit
from PIL import Image
import hashlib
import numpy as np
import uuid
# My data getters
from vtool.tests import grabdata
# Look up the paths of the test images through the project's grabdata helper.
elephant = grabdata.get_testimg_path('elephant.jpg')
lena = grabdata.get_testimg_path('lena.jpg')
zebra = grabdata.get_testimg_path('zebra.jpg')
jeff = grabdata.get_testimg_path('jeff.png')
# Image path handed to every make_uuid_* function by the timing loop.
gpath = elephant
# A `profile` builtin is present when some tool has injected one (the
# surrounding text uses line_profiler); remember whether it exists.
__LINE_PROFILE__ = hasattr(__builtin__, 'profile')
if not __LINE_PROFILE__:
    def profile(func):
        """No-op stand-in so ``@profile`` decorations work without a profiler."""
        return func
@profile
def get_image_uuid(img_bytes_):
    """Map a byte string to a deterministic UUID via a truncated SHA-1 digest.

    Args:
        img_bytes_: bytes-like data to hash (raw image bytes in this script).

    Returns:
        uuid.UUID: UUID built from the first 16 bytes of the SHA-1 digest.
    """
    sha1_digest = hashlib.sha1(img_bytes_).digest()
    # A UUID carries 16 bytes but SHA-1 emits 20; drop the trailing four.
    leading_16 = sha1_digest[:16]
    return uuid.UUID(bytes=leading_16)
@profile
def make_uuid_PIL_bytes(gpath):
    """Benchmark variant: hash the complete pixel buffer obtained from PIL.

    Opens ``gpath`` with ``Image.open``, serializes the image with
    ``tobytes()`` and passes the result to ``get_image_uuid``.

    Args:
        gpath: path of the image file to open.

    Returns:
        The UUID returned by ``get_image_uuid`` for the image bytes.
    """
    image = Image.open(gpath, 'r')
    # Full dump straight from PIL; no numpy conversion and no striding.
    return get_image_uuid(image.tobytes())
@profile
def make_uuid_NUMPY_bytes(gpath):
    """Benchmark variant: hash every byte of the image via a numpy array.

    Args:
        gpath: path of the image file to open.

    Returns:
        The UUID returned by ``get_image_uuid`` for the flattened array bytes.
    """
    pil_img = Image.open(gpath, 'r')
    # Read PIL image data
    np_img = np.asarray(pil_img)
    np_flat = np_img.ravel()
    # tobytes() is the supported spelling of the deprecated ndarray.tostring()
    # (removed in NumPy 2.0); it produces the same bytes.
    img_bytes_ = np_flat.tobytes()
    uuid_ = get_image_uuid(img_bytes_)
    return uuid_
@profile
def make_uuid_NUMPY_STRIDE_16_bytes(gpath):
    """Benchmark variant: hash every 16th element of the flattened image array.

    Args:
        gpath: path of the image file to open.

    Returns:
        The UUID returned by ``get_image_uuid`` for the subsampled bytes.
    """
    pil_img = Image.open(gpath, 'r')
    # Read PIL image data
    np_img = np.asarray(pil_img)
    # Keep one element out of every 16 of the flattened array.
    np_flat = np_img.ravel()[::16]
    # tobytes() is the supported spelling of the deprecated ndarray.tostring()
    # (removed in NumPy 2.0); it produces the same bytes.
    img_bytes_ = np_flat.tobytes()
    uuid_ = get_image_uuid(img_bytes_)
    return uuid_
@profile
def make_uuid_NUMPY_STRIDE_64_bytes(gpath):
    """Benchmark variant: hash every 64th element of the flattened image array.

    Args:
        gpath: path of the image file to open.

    Returns:
        The UUID returned by ``get_image_uuid`` for the subsampled bytes.
    """
    pil_img = Image.open(gpath, 'r')
    # Read PIL image data, keep one element out of every 64, and serialize.
    # tobytes() is the supported spelling of the deprecated ndarray.tostring()
    # (removed in NumPy 2.0); it produces the same bytes.
    img_bytes_ = np.asarray(pil_img).ravel()[::64].tobytes()
    uuid_ = get_image_uuid(img_bytes_)
    return uuid_
@profile
def make_uuid_CONTIG_NUMPY_bytes(gpath):
    """Benchmark variant: hash every byte after forcing a contiguous array.

    The flattened pixel array itself is passed to ``np.ascontiguousarray``
    and serialized exactly once. The previous code serialized to a byte
    string first, so ``ascontiguousarray`` wrapped that string instead of
    the pixel array and the data was serialized twice.

    Args:
        gpath: path of the image file to open.

    Returns:
        The UUID returned by ``get_image_uuid`` for the array bytes.
    """
    pil_img = Image.open(gpath, 'r')
    # Read PIL image data
    np_img = np.asarray(pil_img)
    np_flat = np_img.ravel()
    np_contig = np.ascontiguousarray(np_flat)
    # tobytes() is the supported spelling of the deprecated ndarray.tostring()
    # (removed in NumPy 2.0).
    img_bytes_ = np_contig.tobytes()
    uuid_ = get_image_uuid(img_bytes_)
    return uuid_
@profile
def make_uuid_CONTIG_NUMPY_STRIDE_16_bytes(gpath):
    """Benchmark variant: hash every 16th element, copied to a contiguous array.

    Args:
        gpath: path of the image file to open.

    Returns:
        The UUID returned by ``get_image_uuid`` for the subsampled bytes.
    """
    pil_img = Image.open(gpath, 'r')
    # Read PIL image data
    np_img = np.asarray(pil_img)
    # Subsample one element in 16, then request a contiguous array of it.
    np_contig = np.ascontiguousarray(np_img.ravel()[::16])
    # tobytes() is the supported spelling of the deprecated ndarray.tostring()
    # (removed in NumPy 2.0); it produces the same bytes.
    img_bytes_ = np_contig.tobytes()
    uuid_ = get_image_uuid(img_bytes_)
    return uuid_
@profile
def make_uuid_CONTIG_NUMPY_STRIDE_64_bytes(gpath):
    """Benchmark variant: hash every 64th element, copied to a contiguous array.

    Args:
        gpath: path of the image file to open.

    Returns:
        The UUID returned by ``get_image_uuid`` for the subsampled bytes.
    """
    pil_img = Image.open(gpath, 'r')
    # Read PIL image data, subsample one element in 64, request a contiguous
    # array of it, then serialize. tobytes() is the supported spelling of the
    # deprecated ndarray.tostring() (removed in NumPy 2.0); same bytes.
    img_bytes_ = np.ascontiguousarray(np.asarray(pil_img).ravel()[::64]).tobytes()
    uuid_ = get_image_uuid(img_bytes_)
    return uuid_
if __name__ == '__main__':
    # Time each UUID-generation variant on the image at `gpath`.
    test_funcs = [
        make_uuid_PIL_bytes,
        make_uuid_NUMPY_bytes,
        make_uuid_NUMPY_STRIDE_16_bytes,
        make_uuid_NUMPY_STRIDE_64_bytes,
        make_uuid_CONTIG_NUMPY_bytes,
        make_uuid_CONTIG_NUMPY_STRIDE_16_bytes,
        make_uuid_CONTIG_NUMPY_STRIDE_64_bytes,
    ]
    # `__name__` exists on functions in both Python 2 and 3, whereas
    # `func_name` is Python 2 only.
    func_strs = ', '.join([func.__name__ for func in test_funcs])
    # timeit runs its statement in a fresh namespace, so the setup code must
    # import the image path and every function under test from this module.
    setup = 'from __main__ import (gpath, %s) ' % (func_strs,)
    number = 2  # repetitions per function
    for func in test_funcs:
        func_name = func.__name__
        print('Running: %s' % func_name)
        if __LINE_PROFILE__:
            # A builtin `profile` decorator is present: time with a plain
            # loop instead of going through timeit.
            start = time.time()
            # `range` works on Python 2 and 3 (`xrange` is Python 2 only).
            for _ in range(number):
                func(gpath)
            total_time = time.time() - start
        else:
            stmt = '%s(gpath)' % func_name
            total_time = timeit.timeit(stmt=stmt, setup=setup, number=number)
        print('timed: %r seconds in %s' % (total_time, func_name))
다음은 Windows에서의 line profiler 결과입니다:
File: _timeits/time_uuids.py
Function: make_uuid_CONTIG_NUMPY_STRIDE_16_bytes at line 91
Total time: 1.03287 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
91 @profile
92 def make_uuid_CONTIG_NUMPY_STRIDE_16_bytes(gpath):
93 2 3571 1785.5 0.1 pil_img = Image.open(gpath, 'r')
94 # Read PIL image data
95 2 3310103 1655051.5 96.2 np_img = np.asarray(pil_img)
96 2 44833 22416.5 1.3 np_contig = np.ascontiguousarray(np_img.ravel()
[::16])
97 2 9657 4828.5 0.3 img_bytes_ = np_contig.tostring()
98 2 72560 36280.0 2.1 uuid_ = get_image_uuid(img_bytes_)
99 2 4 2.0 0.0 return uuid_
File: _timeits/time_uuids.py
Function: make_uuid_CONTIG_NUMPY_STRIDE_64_bytes at line 102
Total time: 1.0385 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
102 @profile
103 def make_uuid_CONTIG_NUMPY_STRIDE_64_bytes(gpath):
104 2 3285 1642.5 0.1 pil_img = Image.open(gpath, 'r')
105 # Read PIL image data
106 2 3436641 1718320.5 99.3 img_bytes_ = np.ascontiguousarray(np.asarray(p
il_img).ravel()[::64]).tostring()
107 2 19570 9785.0 0.6 uuid_ = get_image_uuid(img_bytes_)
108 2 4 2.0 0.0 return uuid_
File: _timeits/time_uuids.py
Function: make_uuid_NUMPY_STRIDE_64_bytes at line 70
Total time: 1.04175 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
70 @profile
71 def make_uuid_NUMPY_STRIDE_64_bytes(gpath):
72 2 3356 1678.0 0.1 pil_img = Image.open(gpath, 'r')
73 # Read PIL image data
74 2 3447197 1723598.5 99.3 img_bytes_ = np.asarray(pil_img).ravel()[::64]
.tostring()
75 2 19774 9887.0 0.6 uuid_ = get_image_uuid(img_bytes_)
76 2 4 2.0 0.0 return uuid_
File: _timeits/time_uuids.py
Function: make_uuid_NUMPY_STRIDE_16_bytes at line 59
Total time: 1.0913 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
59 @profile
60 def make_uuid_NUMPY_STRIDE_16_bytes(gpath):
61 2 3706 1853.0 0.1 pil_img = Image.open(gpath, 'r')
62 # Read PIL image data
63 2 3339663 1669831.5 91.9 np_img = np.asarray(pil_img)
64 2 112 56.0 0.0 np_flat = np_img.ravel()[::16]
65 2 217844 108922.0 6.0 img_bytes_ = np_flat.tostring()
66 2 74044 37022.0 2.0 uuid_ = get_image_uuid(img_bytes_)
67 2 4 2.0 0.0 return uuid_
File: _timeits/time_uuids.py
Function: get_image_uuid at line 28
Total time: 1.10141 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
28 @profile
29 def get_image_uuid(img_bytes_):
30 # hash the bytes using sha1
31 14 3665965 261854.6 99.9 bytes_sha1 = hashlib.sha1(img_bytes_)
32 14 326 23.3 0.0 hashbytes_20 = bytes_sha1.digest()
33 # sha1 produces 20 bytes, but UUID requires 16
bytes
34 14 75 5.4 0.0 hashbytes_16 = hashbytes_20[0:16]
35 14 2661 190.1 0.1 uuid_ = uuid.UUID(bytes=hashbytes_16)
36 14 40 2.9 0.0 return uuid_
File: _timeits/time_uuids.py
Function: make_uuid_PIL_bytes at line 39
Total time: 1.33926 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
39 @profile
40 def make_uuid_PIL_bytes(gpath):
41 2 25940 12970.0 0.6 pil_img = Image.open(gpath, 'r')
42 # Read PIL image data
43 2 3277455 1638727.5 73.5 img_bytes_ = pil_img.tobytes()
44 2 1158009 579004.5 26.0 uuid_ = get_image_uuid(img_bytes_)
45 2 4 2.0 0.0 return uuid_
File: _timeits/time_uuids.py
Function: make_uuid_NUMPY_bytes at line 48
Total time: 1.39694 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
48 @profile
49 def make_uuid_NUMPY_bytes(gpath):
50 2 3406 1703.0 0.1 pil_img = Image.open(gpath, 'r')
51 # Read PIL image data
52 2 3344608 1672304.0 71.9 np_img = np.asarray(pil_img)
53 2 46 23.0 0.0 np_flat = np_img.ravel()
54 2 133593 66796.5 2.9 img_bytes_ = np_flat.tostring()
55 2 1171888 585944.0 25.2 uuid_ = get_image_uuid(img_bytes_)
56 2 5 2.5 0.0 return uuid_
File: _timeits/time_uuids.py
Function: make_uuid_CONTIG_NUMPY_bytes at line 79
Total time: 1.4899 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
79 @profile
80 def make_uuid_CONTIG_NUMPY_bytes(gpath):
81 2 3384 1692.0 0.1 pil_img = Image.open(gpath, 'r')
82 # Read PIL image data
83 2 3376051 1688025.5 68.0 np_img = np.asarray(pil_img)
84 2 133156 66578.0 2.7 np_flat = np_img.ravel().tostring()
85 2 146959 73479.5 3.0 np_contig = np.ascontiguousarray(np_flat)
86 2 149330 74665.0 3.0 img_bytes_ = np_contig.tostring()
87 2 1154328 577164.0 23.3 uuid_ = get_image_uuid(img_bytes_)
88 2 4 2.0 0.0 return uuid_
다음은 Linux에서의 line profiler 결과입니다:
File: _timeits/time_uuids.py
Function: make_uuid_NUMPY_STRIDE_64_bytes at line 70
Total time: 0.456272 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
70 @profile
71 def make_uuid_NUMPY_STRIDE_64_bytes(gpath):
72 2 449 224.5 0.1 pil_img = Image.open(gpath, 'r')
73 # Read PIL image data
74 2 452880 226440.0 99.3 img_bytes_ = np.asarray(pil_img).ravel()[::64].
tostring()
75 2 2942 1471.0 0.6 uuid_ = get_image_uuid(img_bytes_)
76 2 1 0.5 0.0 return uuid_
File: _timeits/time_uuids.py
Function: make_uuid_CONTIG_NUMPY_STRIDE_64_bytes at line 102
Total time: 0.457588 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
102 @profile
103 def make_uuid_CONTIG_NUMPY_STRIDE_64_bytes(gpath):
104 2 445 222.5 0.1 pil_img = Image.open(gpath, 'r')
105 # Read PIL image data
106 2 454269 227134.5 99.3 img_bytes_ = np.ascontiguousarray(np.asarray(pi
l_img).ravel()[::64]).tostring()
107 2 2872 1436.0 0.6 uuid_ = get_image_uuid(img_bytes_)
108 2 2 1.0 0.0 return uuid_
File: _timeits/time_uuids.py
Function: make_uuid_CONTIG_NUMPY_STRIDE_16_bytes at line 91
Total time: 0.461928 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
91 @profile
92 def make_uuid_CONTIG_NUMPY_STRIDE_16_bytes(gpath):
93 2 482 241.0 0.1 pil_img = Image.open(gpath, 'r')
94 # Read PIL image data
95 2 436622 218311.0 94.5 np_img = np.asarray(pil_img)
96 2 10990 5495.0 2.4 np_contig = np.ascontiguousarray(np_img.ravel()
[::16])
97 2 2931 1465.5 0.6 img_bytes_ = np_contig.tostring()
98 2 10902 5451.0 2.4 uuid_ = get_image_uuid(img_bytes_)
99 2 1 0.5 0.0 return uuid_
File: _timeits/time_uuids.py
Function: make_uuid_NUMPY_STRIDE_16_bytes at line 59
Total time: 0.492819 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
59 @profile
60 def make_uuid_NUMPY_STRIDE_16_bytes(gpath):
61 2 481 240.5 0.1 pil_img = Image.open(gpath, 'r')
62 # Read PIL image data
63 2 441343 220671.5 89.6 np_img = np.asarray(pil_img)
64 2 34 17.0 0.0 np_flat = np_img.ravel()[::16]
65 2 39996 19998.0 8.1 img_bytes_ = np_flat.tostring()
66 2 10964 5482.0 2.2 uuid_ = get_image_uuid(img_bytes_)
67 2 1 0.5 0.0 return uuid_
File: _timeits/time_uuids.py
Function: get_image_uuid at line 28
Total time: 0.545926 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
28 @profile
29 def get_image_uuid(img_bytes_):
30 # hash the bytes using sha1
31 14 545037 38931.2 99.8 bytes_sha1 = hashlib.sha1(img_bytes_)
32 14 115 8.2 0.0 hashbytes_20 = bytes_sha1.digest()
33 # sha1 produces 20 bytes, but UUID requires 16
bytes
34 14 24 1.7 0.0 hashbytes_16 = hashbytes_20[0:16]
35 14 742 53.0 0.1 uuid_ = uuid.UUID(bytes=hashbytes_16)
36 14 8 0.6 0.0 return uuid_
File: _timeits/time_uuids.py
Function: make_uuid_PIL_bytes at line 39
Total time: 0.625736 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
39 @profile
40 def make_uuid_PIL_bytes(gpath):
41 2 3915 1957.5 0.6 pil_img = Image.open(gpath, 'r')
42 # Read PIL image data
43 2 449092 224546.0 71.8 img_bytes_ = pil_img.tobytes()
44 2 172728 86364.0 27.6 uuid_ = get_image_uuid(img_bytes_)
45 2 1 0.5 0.0 return uuid_
File: _timeits/time_uuids.py
Function: make_uuid_NUMPY_bytes at line 48
Total time: 0.663057 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
48 @profile
49 def make_uuid_NUMPY_bytes(gpath):
50 2 468 234.0 0.1 pil_img = Image.open(gpath, 'r')
51 # Read PIL image data
52 2 437346 218673.0 66.0 np_img = np.asarray(pil_img)
53 2 18 9.0 0.0 np_flat = np_img.ravel()
54 2 51512 25756.0 7.8 img_bytes_ = np_flat.tostring()
55 2 173712 86856.0 26.2 uuid_ = get_image_uuid(img_bytes_)
56 2 1 0.5 0.0 return uuid_
File: _timeits/time_uuids.py
Function: make_uuid_CONTIG_NUMPY_bytes at line 79
Total time: 0.756671 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
79 @profile
80 def make_uuid_CONTIG_NUMPY_bytes(gpath):
81 2 483 241.5 0.1 pil_img = Image.open(gpath, 'r')
82 # Read PIL image data
83 2 437192 218596.0 57.8 np_img = np.asarray(pil_img)
84 2 48152 24076.0 6.4 np_flat = np_img.ravel().tostring()
85 2 49502 24751.0 6.5 np_contig = np.ascontiguousarray(np_flat)
86 2 49269 24634.5 6.5 img_bytes_ = np_contig.tostring()
87 2 172072 86036.0 22.7 uuid_ = get_image_uuid(img_bytes_)
88 2 1 0.5 0.0 return uuid_
Windows와 Linux에서 테스트했습니다. 다음은 Windows timeit 결과입니다:
Running: make_uuid_PIL_bytes
timed: 1.4041314945785952 seconds in make_uuid_PIL_bytes
Running: make_uuid_NUMPY_bytes
timed: 1.4475939890251077 seconds in make_uuid_NUMPY_bytes
Running: make_uuid_NUMPY_STRIDE_16_bytes
timed: 1.136886564762671 seconds in make_uuid_NUMPY_STRIDE_16_bytes
Running: make_uuid_NUMPY_STRIDE_64_bytes
timed: 1.0767879228155284 seconds in make_uuid_NUMPY_STRIDE_64_bytes
Running: make_uuid_CONTIG_NUMPY_bytes
timed: 1.5433727380795146 seconds in make_uuid_CONTIG_NUMPY_bytes
Running: make_uuid_CONTIG_NUMPY_STRIDE_16_bytes
timed: 1.0804961515831941 seconds in make_uuid_CONTIG_NUMPY_STRIDE_16_bytes
Running: make_uuid_CONTIG_NUMPY_STRIDE_64_bytes
timed: 1.0577325560451953 seconds in make_uuid_CONTIG_NUMPY_STRIDE_64_bytes
그리고 다음은 Linux timeit 결과입니다:
Running: make_uuid_PIL_bytes
timed: 0.6316661834716797 seconds in make_uuid_PIL_bytes
Running: make_uuid_NUMPY_bytes
timed: 0.666496992111206 seconds in make_uuid_NUMPY_bytes
Running: make_uuid_NUMPY_STRIDE_16_bytes
timed: 0.4908161163330078 seconds in make_uuid_NUMPY_STRIDE_16_bytes
Running: make_uuid_NUMPY_STRIDE_64_bytes
timed: 0.4494049549102783 seconds in make_uuid_NUMPY_STRIDE_64_bytes
Running: make_uuid_CONTIG_NUMPY_bytes
timed: 0.7838680744171143 seconds in make_uuid_CONTIG_NUMPY_bytes
Running: make_uuid_CONTIG_NUMPY_STRIDE_16_bytes
timed: 0.462860107421875 seconds in make_uuid_CONTIG_NUMPY_STRIDE_16_bytes
Running: make_uuid_CONTIG_NUMPY_STRIDE_64_bytes
timed: 0.45322108268737793 seconds in make_uuid_CONTIG_NUMPY_STRIDE_64_bytes
따라서 이미지 로딩이 주된 병목인 것으로 보입니다(이미지가 매우 크기 때문입니다). 다만 스트라이드(stride)를 적용하면 해싱 시간이 작지만 의미 있는 정도로 줄어듭니다.
그래도 데이터의 일부만 로드할 수 있다면 좋겠습니다. 이렇게 하는 방법을 아시는 분이 있습니까?
하지만 이렇게 하면 모든 데이터를 암묵적으로 읽게 되지 않습니까? 전체를 numpy 배열에 넣은 다음 값을 잘라내는 것이니까요. 느리지 않을까요? – Erotemic
느릴 수도 있습니다! timeit으로 측정해서 결과를 확인해 보세요 :) 속도를 제한하는 단계는 데이터를 읽는 것이 아니라 sha1을 계산하는 것이라고 생각합니다. 다만 np.asarray가 복사본을 만들 수도 있습니다. 역시 시간을 측정해 보세요. – bgschiller
(line_profile을 사용해 보니) 이미지를 numpy로 읽어 들이는 부분이 병목인 것 같습니다. 나중에 더 자세한 테스트 결과를 올리겠습니다. – Erotemic