2017-12-05 30 views
2

AWS p2.8xlarge를 사용하고 있으며, k-겹 교차 검증(k-fold cross validation)으로 모델을 평가하려고 합니다. 첫 번째 반복이 끝나면 GPU 메모리가 가득 차서, 다시 훈련을 시도할 때 CUDA 메모리 오류가 발생합니다. (제목: MXNet 백엔드가 있는 Keras 1.2.2에서 GPU 메모리 재설정)

제 질문은 루프 안에서 GPU 메모리를 재설정하는 방법입니다. K.clear_session()과 gc.collect()를 모두 사용해 보았지만, 둘 다 작동하지 않았습니다.

오류 메시지 :

> MXNetError        Traceback (most recent call 
> last) ~/anaconda3/lib/python3.6/site-packages/mxnet/symbol.py in 
> simple_bind(self, ctx, grad_req, type_dict, group2ctx, 
> shared_arg_names, shared_exec, shared_buffer, **kwargs) 1472  
> shared_exec_handle, 
> -> 1473             ctypes.byref(exe_handle))) 1474   except MXNetError as e: 
> 
> ~/anaconda3/lib/python3.6/site-packages/mxnet/base.py in 
> check_call(ret) 
>  128  if ret != 0: 
> --> 129   raise MXNetError(py_str(_LIB.MXGetLastError())) 
>  130 
> 
> MXNetError: [19:24:04] src/storage/./pooled_storage_manager.h:102: 
> cudaMalloc failed: out of memory 
> 
> Stack trace returned 10 entries: [bt] (0) 
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x1d57cc) 
> [0x7f55ce9fe7cc] [bt] (1) 
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x1242238) 
> [0x7f55cfa6b238] [bt] (2) 
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x1244c0a) 
> [0x7f55cfa6dc0a] [bt] (3) 
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0xe4d4db) 
> [0x7f55cf6764db] [bt] (4) 
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0xe549cd) 
> [0x7f55cf67d9cd] [bt] (5) 
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0xe59f95) 
> [0x7f55cf682f95] [bt] (6) 
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0xe5d6ee) 
> [0x7f55cf6866ee] [bt] (7) 
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0xe5dcd4) 
> [0x7f55cf686cd4] [bt] (8) 
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(MXExecutorSimpleBind+0x2261) 
> [0x7f55cf605291] [bt] (9) 
> /home/ubuntu/anaconda3/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) 
> [0x7f560d6c4ec0] 
> 
> 
> During handling of the above exception, another exception occurred: 
> 
> RuntimeError        Traceback (most recent call 
> last) <ipython-input-4-0720b69f15af> in <module>() 
>  33 if val_batches.n>0: 
>  34  hist = model.fit_generator(generator=train_gen, samples_per_epoch=batches.n, 
> ---> 35   nb_epoch=epochs, verbose=True, validation_data=val_gen, nb_val_samples=val_batches.n, 
> callbacks=callbacks) 
>  36 else: 
>  37  model.fit_generator(generator=train_gen, samples_per_epoch=batches.n, 
> 
> ~/anaconda3/lib/python3.6/site-packages/Keras-1.2.2-py3.6.egg/keras/engine/training.py 
> in fit_generator(self, generator, samples_per_epoch, nb_epoch, 
> verbose, callbacks, validation_data, nb_val_samples, class_weight, 
> max_q_size, nb_worker, pickle_safe, initial_epoch) 1557    
> outs = self.train_on_batch(x, y, 1558        
> sample_weight=sample_weight, 
> -> 1559            class_weight=class_weight) 1560  1561      if not 
> isinstance(outs, list): 
> 
> ~/anaconda3/lib/python3.6/site-packages/Keras-1.2.2-py3.6.egg/keras/engine/training.py 
> in train_on_batch(self, x, y, sample_weight, class_weight) 1320  
> ins = x + y + sample_weights 1321   
> self._make_train_function() 
> -> 1322   outputs = self.train_function(ins) 1323   if len(outputs) == 1: 1324    return outputs[0] 
> 
> ~/anaconda3/lib/python3.6/site-packages/Keras-1.2.2-py3.6.egg/keras/engine/training.py 
> in train_function(inputs) 1952   def 
> _make_train_function(self): 1953    def train_function(inputs): 
> -> 1954     data, label, _, data_shapes, label_shapes = self._adjust_module(inputs, 'train') 1955  1956     
> batch = K.mx.io.DataBatch(data=data, label=label, bucket_key='train', 
> 
> ~/anaconda3/lib/python3.6/site-packages/Keras-1.2.2-py3.6.egg/keras/engine/training.py 
> in _adjust_module(self, inputs, phase) 1908    if not 
> self._mod.binded: 1909     
> self._mod.bind(data_shapes=data_shapes, label_shapes=None, 
> -> 1910        for_training=True) 1911     self._set_weights() 1912     
> self._mod.init_optimizer(kvstore=self._kvstore, 
> optimizer=self.optimizer) 
> 
> ~/anaconda3/lib/python3.6/site-packages/mxnet/module/bucketing_module.py 
> in bind(self, data_shapes, label_shapes, for_training, 
> inputs_need_grad, force_rebind, shared_module, grad_req) 
>  322       state_names=self._state_names) 
>  323   module.bind(data_shapes, label_shapes, for_training, inputs_need_grad, 
> --> 324      force_rebind=False, shared_module=None, grad_req=grad_req) 
>  325   self._curr_module = module 
>  326   self._curr_bucket_key = self._default_bucket_key 
> 
> ~/anaconda3/lib/python3.6/site-packages/mxnet/module/module.py in 
> bind(self, data_shapes, label_shapes, for_training, inputs_need_grad, 
> force_rebind, shared_module, grad_req) 
>  415              fixed_param_names=self._fixed_param_names, 
>  416              grad_req=grad_req, 
> --> 417              state_names=self._state_names) 
>  418   self._total_exec_bytes = self._exec_group._total_exec_bytes 
>  419   if shared_module is not None: 
> 
> ~/anaconda3/lib/python3.6/site-packages/mxnet/module/executor_group.py 
> in __init__(self, symbol, contexts, workload, data_shapes, 
> label_shapes, param_names, for_training, inputs_need_grad, 
> shared_group, logger, fixed_param_names, grad_req, state_names) 
>  229   self.num_outputs = len(self.symbol.list_outputs()) 
>  230 
> --> 231   self.bind_exec(data_shapes, label_shapes, shared_group) 
>  232 
>  233  def decide_slices(self, data_shapes): 
> 
> ~/anaconda3/lib/python3.6/site-packages/mxnet/module/executor_group.py 
> in bind_exec(self, data_shapes, label_shapes, shared_group, reshape) 
>  325    else: 
>  326     self.execs.append(self._bind_ith_exec(i, data_shapes_i, label_shapes_i, 
> --> 327              shared_group)) 
>  328 
>  329   self.data_shapes = data_shapes 
> 
> ~/anaconda3/lib/python3.6/site-packages/mxnet/module/executor_group.py 
> in _bind_ith_exec(self, i, data_shapes, label_shapes, shared_group) 
>  601           type_dict=input_types, shared_arg_names=self.param_names, 
>  602           shared_exec=shared_exec, 
> --> 603           shared_buffer=shared_data_arrays, **input_shapes) 
>  604   self._total_exec_bytes += int(executor.debug_str().split('\n')[-3].split()[1]) 
>  605   return executor 
> 
> ~/anaconda3/lib/python3.6/site-packages/mxnet/symbol.py in 
> simple_bind(self, ctx, grad_req, type_dict, group2ctx, 
> shared_arg_names, shared_exec, shared_buffer, **kwargs) 1477  
> error_msg += "%s: %s\n" % (k, v) 1478    error_msg += "%s" 
> % e 
> -> 1479    raise RuntimeError(error_msg) 1480  1481   # update shared_buffer 
> 
> RuntimeError: simple_bind error. Arguments: input_1_1: (64, 3, 224, 
> 224) [19:24:04] src/storage/./pooled_storage_manager.h:102: cudaMalloc 
> failed: out of memory 
> 
> Stack trace returned 10 entries: [bt] (0) 
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x1d57cc) 
> [0x7f55ce9fe7cc] [bt] (1) 
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x1242238) 
> [0x7f55cfa6b238] [bt] (2) 
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x1244c0a) 
> [0x7f55cfa6dc0a] [bt] (3) 
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0xe4d4db) 
> [0x7f55cf6764db] [bt] (4) 
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0xe549cd) 
> [0x7f55cf67d9cd] [bt] (5) 
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0xe59f95) 
> [0x7f55cf682f95] [bt] (6) 
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0xe5d6ee) 
> [0x7f55cf6866ee] [bt] (7) 
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0xe5dcd4) 
> [0x7f55cf686cd4] [bt] (8) 
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(MXExecutorSimpleBind+0x2261) 
> [0x7f55cf605291] [bt] (9) 
> /home/ubuntu/anaconda3/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) 
> [0x7f560d6c4ec0] 

enter image description here

+0

루프가 수행하는 작업은 무엇입니까? 하이퍼 매개 변수 검색? 그렇다면 각 평가에 대해 새로운 프로세스를 시작할 수도 있습니다. – geoalgo

+0

나는 k-fold 교차 검증을하고 있습니다. 현재 저는 매 30 분마다 cron 작업을 실행하고 있지만 ... 매우 귀찮습니다. –

답변

1

gc.collect를 사용하여 GPU 메모리 사용량(footprint)을 단일 실행분의 두 배 수준으로 제한할 수 있었습니다. 그것 없이는 메모리 사용량이 계속 증가했습니다. 저는 결과를 반환한 뒤 gc가 모델을 정리할 수 있도록, 훈련과 평가를 하나의 함수로 만들었습니다.

import gc  # needed for gc.collect(); the original snippet used gc without importing it

cv_results = []
for train, test in cv_folds:
    # train_and_eval must return only plain results (not the model object),
    # so that no reference to the model survives the call and gc can
    # reclaim it -- MXNet then returns the GPU memory to its internal pool.
    result = train_and_eval(train, test)
    cv_results.append(result)
    # Force a collection between folds; without this the footprint kept growing.
    gc.collect()

메모리 사용량이 여전히 단일 실행의 두 배이므로, 이를 보완하기 위해 배치 크기를 줄여야 할 수 있습니다. 그러면 모든 것이 GPU 메모리에 들어갈 것입니다.

MXNet이 실제로 GPU에서 메모리를 할당 해제하지 않는다면 나중에 사용할 수 있도록 메모리를 내부 메모리 풀에 다시 추가해야합니다. 따라서 GPU 메모리 사용량이 nvidia-smi에서 여전히 높게 나타날 수 있지만 메모리는 여전히 MXNet에서 자유롭게 사용할 수 있습니다. 대부분의 계산 단계와 마찬가지로이 메모리의 가비지 수집은 비동기 적으로 발생합니다.

GPU에 두 번의 실행 분량의 메모리를 담을 수 없다면, geoalgo가 언급한 대로 다음과 비슷한 방법으로 하위 프로세스를 시작할 수 있습니다. 프로세스가 종료되면 GPU 메모리가 완전히 해제됩니다.

from subprocess import Popen, PIPE, STDOUT 
import json 

def eval_on_fold(indicies):
    """Evaluate one CV fold in a child process so GPU memory is freed on exit.

    indicies: dict with 'train' and 'test' index lists; it is JSON-serialized
        and piped to the child's stdin (the child is expected to read it there).
    Returns the evaluation metric printed by the child, parsed as a float.
    Raises RuntimeError if the child process exits with a nonzero status.
    """
    indicies_str = json.dumps(indicies)
    # universal_newlines=True opens the pipes in text mode: without it,
    # communicate(input=<str>) raises TypeError on Python 3 (bytes expected).
    p = Popen(['python', 'train_and_eval.py', '--cv-indicies'],
              stdout=PIPE, stdin=PIPE, stderr=PIPE,
              universal_newlines=True)
    eval_metric_str, err = p.communicate(input=indicies_str)
    # Surface child failures explicitly instead of crashing on float('').
    if p.returncode != 0:
        raise RuntimeError('train_and_eval.py failed: %s' % err)
    eval_metric = float(eval_metric_str)
    return eval_metric

# Run every fold in its own subprocess and gather the metrics.
cv_results = [
    eval_on_fold({'train': train, 'test': test})
    for train, test in cv_folds
]