Bulk (helper) Elastic 검색에서 인덱싱 오류가 발생했습니다.

트위터에서 데이터를 가져와 필터링한 뒤 제너레이터를 만들었고, elasticsearch의 helpers를 사용해 일괄(bulk) 색인을 시도했지만 다음과 같은 오류가 발생했습니다. 문제는 bulk 헬퍼로 색인할 때 인덱싱 오류가 발생한다는 것입니다.

Traceback (most recent call last): 
    File "/Users/aqm1152/_acert_/basic/test_collection_dump.py", line 245, in <module> 
    sinceid, complete, api_counter, maxid = search_tweets(qu=query_word, cnt=cnt, sinceid=x , maxitr= 149 , fname=query_word) 
    File "/Users/aqm1152/_acert_/basic/test_collection_dump.py", line 138, in search_tweets 
    res = elastic_search.bulk_es(actions=bulk_content,) 
    File "/Users/aqm1152/_acert_/basic/elasticsearch/acert_basic_elastic_functions.py", line 68, in bulk_es 
    return helpers.bulk(self.es, actions=actions ,stats_only=True) 
    File "/Users/aqm1152/anaconda/lib/python3.5/site-packages/elasticsearch/helpers/__init__.py", line 194, in bulk 
    for ok, item in streaming_bulk(client, actions, **kwargs): 
    File "/Users/aqm1152/anaconda/lib/python3.5/site-packages/elasticsearch/helpers/__init__.py", line 162, in streaming_bulk 
    for result in _process_bulk_chunk(client, bulk_actions, raise_on_exception, raise_on_error, **kwargs): 
    File "/Users/aqm1152/anaconda/lib/python3.5/site-packages/elasticsearch/helpers/__init__.py", line 134, in _process_bulk_chunk 
    raise BulkIndexError('%i document(s) failed to index.' % len(errors), errors) 
elasticsearch.helpers.BulkIndexError: ('46 document(s) failed to index.', [{'index': {'_index': 'twitter', '_id': '866553007252488192', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '866552145507700736', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '866479151317962752', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '866477250459430913', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '866455181839486976', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '866411931405570048', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '866400265573892096', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, 
{'index': {'_index': 'twitter', '_id': '866399318957318144', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '866395810300403713', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '866366506124365824', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '866228703478636545', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '866206827389865984', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '866137742476025856', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '866026883284164610', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '865968929684029441', '_type': 'tweet', 
'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '865728096019894273', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '865707222453571585', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '865675939128029185', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '865626970817572865', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '865564611591815168', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '865553684163211268', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '865519159467098113', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 
'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '865466383684874240', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '865362662879895552', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '865339244604264449', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '865331847710068736', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '865251599928700928', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '865246748603797505', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '865230204293308416', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], 
[lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '865229926622011392', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '865194609349083136', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '865165953612619777', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '865165573289902082', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '865083343917993984', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '865078786655694849', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '865078053134905344', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': 
'864963278233096192', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '864948505143635968', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '864929970702962688', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '864871369217015809', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '864812084521046016', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '864742828550836224', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '864662384060792832', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '864521704248418304', '_type': 'tweet', 'error': {'type': 
'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '864511301221068800', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}, {'index': {'_index': 'twitter', '_id': '864310734817083392', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}}])

elasticsearch가 위치 정보(geo) 데이터로 보이는 여러 필드를 수집(ingest)하는 데 문제가 있는 것 같습니다. 또한 약 671개의 트윗을 생성했는데, Elasticsearch에서 count를 해 보면 454개의 트윗만 색인된 것으로 나옵니다. 46개의 문서는 지리 데이터 때문에 실패한 것으로 보이지만, 정확히 어떤 필드가 문제인지는 오류 메시지에 나와 있지 않습니다.

# Global variable 
tweet_attributes = ['text','source','retweeted', 'retweet_count','place','lang','favorite_count','entities','id','created_at','user:id','user:screen_name','coordinates'] 
def _get_necessary_fields(tweets): 
    doc = defaultdict(dict) 
    fieldInfo = tweet_attributes 
    for tweet in tweets: 
     for fields in fieldInfo : 
      if (len(fields.split(':'))) == 2 : 
       keys = fields.split(':') 
       # nested array at one level 
       doc[keys[0]][keys[1]] = tweet[keys[0]][keys[1]] 
       #TODO implement for more than one level, needs better algo 
      else: 
       # for each field 
       if fields in tweet: 
        doc[fields] = tweet[fields] 
     yield doc 

def _json_for_bulk_body(tweets,el):
    """Return a lazy stream of bulk "index" actions, one per tweet.

    Args:
        tweets: iterable of tweets, forwarded to ``_get_necessary_fields``.
        el: indexable pair where ``el[0]`` is the index name and
            ``el[1][0]`` is the document type.

    Returns:
        A generator of action dicts; nothing is evaluated until the caller
        iterates it.
    """
    # TODO refactor this code when you have time :
    # http://stackoverflow.com/questions/20288770/how-to-use-bulk-api-to-store-the-keywords-in-es-by-using-python
    trimmed_docs = _get_necessary_fields(tweets)

    def _actions():
        for doc in trimmed_docs:
            yield {
                "_op_type": "index",
                "_index": el[0],      # index name Twitter
                "_type": el[1][0],    # type is tweet
                "_id": doc['id'],     # id of the tweet
                "_source": doc,
            }

    return _actions()


    helpers.bulk(self.es, actions=structured_json_body ,stats_only=True)

수있는 사람 :

여기

{ 
    "template": "twitter", 
    "settings": { 
     "number_of_shards": 1, 
     "number_of_replicas": 0 
    }, 
    "mappings": { 
     "tweet": { 
     "properties": { 
      "coordinates": { 
       "type": "geo_point" 
      }, 
      "created_at": { 
       "format": "EEE MMM dd HH:mm:ss Z YYYY", 
       "type": "date" 
      }, 
      "entities": { 
       "properties": { 
        "hashtags": { 
        "properties": { 
         "indices": { 
          "type": "long", 
          "index": "not_analyzed" 
         }, 
         "text": { 
          "type": "text" 
         } 
        } 
        }, 
        "urls": { 
        "properties": { 
         "display_url": { 
          "type": "text", 
          "index": "not_analyzed" 
         }, 
         "expanded_url": { 
          "type": "text", 
          "index": "not_analyzed" 
         }, 
         "indices": { 
          "type": "long", 
          "index": "not_analyzed" 
         }, 
         "url": { 
          "type": "text", 
          "index": "not_analyzed" 
         } 
        } 
        } 
       } 
      }, 
      "symbols": { 
       "type": "integer", 
       "index": "not_analyzed" 
      }, 
      "favorite_count": { 
       "type": "double", 
       "index": "not_analyzed" 
      }, 
      "id": { 
       "type": "long" 
      }, 
      "lang": { 
       "type": "text", 
       "index": "analyzed" 
      }, 
      "place": { 
       "properties": { 
        "attributes": { 
        "type": "object" 
        }, 
        "bounding_box": { 
        "type": "geo_point" 
        }, 
        "country": { 
        "type": "text", 
        "index": "no" 
        }, 
        "country_code": { 
        "type": "text" 
        }, 
        "full_name": { 
        "type": "text", 
        "index": "no" 
        }, 
        "id": { 
        "type": "text" 
        }, 
        "name": { 
        "type": "text", 
        "index": "not_analyzed" 
        }, 
        "place_type": { 
        "type": "text" 
        }, 
        "url": { 
        "type": "text" 
        } 
       } 
      }, 
      "retweet_count": { 
       "type": "long" 
      }, 
      "source": { 
       "type": "text" 
      }, 
      "text": { 
       "type": "text" 
      }, 
      "user": { 
       "type":"object", 
       "properties": { 
        "id": { 
        "type": "long" 
        }, 
        "screen_name": { 
        "type": "text" 
        } 
       } 
      } 
     } 
     } 
    } 
}

제너레이터를 만들어 helpers.bulk로 수집(ingest)하는 데 사용하는 제 코드입니다.

제가 색인에 사용하는 템플릿입니다. 46개 문서를 제외한 나머지 문서가 왜 모두 수집되지 않는지, 그리고 그 46개 문서는 왜 수집에 실패하는지 설명해 주실 수 있나요?

코드에서

출처

2017-05-22 Aboogie

을 발생하는 경우 필드 coordinates을 설정하는 것이 널 상태를 확인하지 필요 ]'. 일부 문서에 '좌표'가 없을 수 있습니까? – Val

네, 일부 문서는 트위터에 위치 정보가 없어서 해당 값이 null입니다. 매핑에서 이 오류를 피하려면 어떻게 해야 하나요? – Aboogie

`coordinates` 필드가 null인 경우에는 해당 필드를 설정하지 않도록 확인해야 합니다. 그렇지 않으면 `field must be either [lat], [lon] or [geohash]` 오류가 발생합니다.

  # for each field 
      if fields in tweet: 
       if tweet[fields] is not None:     <--- add this check 
        doc[fields] = tweet[fields]

출처

2017-05-22 15:25:43 Val

Bulk (helper) Elastic 검색에서 인덱싱 오류가 발생했습니다.

답변

관련 문제