나는 normalize
매우 중첩 된 json 파일을 분석하려고합니다. 내가 고심하고있는 것은 정상화하기 위해 한 단계 이상의 깊은 곳으로가는 법이다.매우 중첩 된 json으로 pandas.io.json.json_normalize
정확하게 수행하고 싶기 때문에 pandas.io.json.json_normalize 문서를 검토했습니다.
나는 그 일부를 정상화 할 수 있었고 이제는 사전이 어떻게 작동 하는지를 이해할 수 있었지만 나는 아직 거기에 없다.
아래 코드를 사용하면 첫 번째 레벨 만 얻을 수 있습니다.
import json
import pandas as pd
from pandas.io.json import json_normalize
with open('authors_sample.json') as f:
d = json.load(f)
raw = json_normalize(d['hits']['hits'])
authors = json_normalize(data = d['hits']['hits'],
record_path = '_source',
meta = ['_id', ['_source', 'journal'], ['_source', 'title'],
['_source', 'normalized_venue_name']
])
나는 아래의 코드를 사용하여 '저자'사전에 '발굴'에 노력하고 있지만, record_path = ['_source', 'authors']
나에게 TypeError: string indices must be integers
가 발생합니다. 지금까지 내가 이해 한대로 json_normalize
로직이 좋았지 만, 여전히 dict
대 list
으로 json으로 다이빙하는 방법을 이해하지 못했습니다.
심지어이 간단한 example을 검토했습니다.
authors = json_normalize(data = d['hits']['hits'],
record_path = ['_source', 'authors'],
meta = ['_id', ['_source', 'journal'], ['_source', 'title'],
['_source', 'normalized_venue_name']
])
아래는 json 파일 (5 개 레코드)의 덩어리입니다.
{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
u'hits': {u'hits': [{u'_id': u'7CB3F2AD',
u'_index': u'scibase_listings',
u'_score': 1.0,
u'_source': {u'authors': None,
u'deleted': 0,
u'description': None,
u'doi': u'',
u'is_valid': 1,
u'issue': None,
u'journal': u'Physical Review Letters',
u'link': None,
u'meta_description': None,
u'meta_keywords': None,
u'normalized_venue_name': u'phys rev lett',
u'pages': None,
u'parent_keywords': [u'Chromatography',
u'Quantum mechanics',
u'Particle physics',
u'Quantum field theory',
u'Analytical chemistry',
u'Quantum chromodynamics',
u'Physics',
u'Mass spectrometry',
u'Chemistry'],
u'pub_date': u'1987-03-02 00:00:00',
u'pubtype': None,
u'rating_avg_weighted': 0,
u'rating_clarity': 0.0,
u'rating_clarity_weighted': 0.0,
u'rating_innovation': 0.0,
u'rating_innovation_weighted': 0.0,
u'rating_num_weighted': 0,
u'rating_reproducability': 0,
u'rating_reproducibility_weighted': 0.0,
u'rating_versatility': 0.0,
u'rating_versatility_weighted': 0.0,
u'review_count': 0,
u'tag': [u'mass spectra', u'elementary particles', u'bound states'],
u'title': u'Evidence for a new meson: A quasinuclear NN-bar bound state',
u'userAvg': 0.0,
u'user_id': None,
u'venue_name': u'Physical Review Letters',
u'views_count': 0,
u'volume': None},
u'_type': u'listing'},
{u'_id': u'7AF8EBC3',
u'_index': u'scibase_listings',
u'_score': 1.0,
u'_source': {u'authors': [{u'affiliations': [u'Punjabi University'],
u'author_id': u'780E3459',
u'author_name': u'munish puri'},
{u'affiliations': [u'Punjabi University'],
u'author_id': u'48D92C79',
u'author_name': u'rajesh dhaliwal'},
{u'affiliations': [u'Punjabi University'],
u'author_id': u'7D9BD37C',
u'author_name': u'r s singh'}],
u'deleted': 0,
u'description': None,
u'doi': u'',
u'is_valid': 1,
u'issue': None,
u'journal': u'Journal of Industrial Microbiology & Biotechnology',
u'link': None,
u'meta_description': None,
u'meta_keywords': None,
u'normalized_venue_name': u'j ind microbiol biotechnol',
u'pages': None,
u'parent_keywords': [u'Nuclear medicine',
u'Psychology',
u'Hydrology',
u'Chromatography',
u'X-ray crystallography',
u'Nuclear fusion',
u'Medicine',
u'Fluid dynamics',
u'Thermodynamics',
u'Physics',
u'Gas chromatography',
u'Radiobiology',
u'Engineering',
u'Organic chemistry',
u'High-performance liquid chromatography',
u'Chemistry',
u'Organic synthesis',
u'Psychotherapist'],
u'pub_date': u'2008-04-04 00:00:00',
u'pubtype': None,
u'rating_avg_weighted': 0,
u'rating_clarity': 0.0,
u'rating_clarity_weighted': 0.0,
u'rating_innovation': 0.0,
u'rating_innovation_weighted': 0.0,
u'rating_num_weighted': 0,
u'rating_reproducability': 0,
u'rating_reproducibility_weighted': 0.0,
u'rating_versatility': 0.0,
u'rating_versatility_weighted': 0.0,
u'review_count': 0,
u'tag': [u'flow rate',
u'operant conditioning',
u'packed bed reactor',
u'immobilized enzyme',
u'specific activity'],
u'title': u'Development of a stable continuous flow immobilized enzyme reactor for the hydrolysis of inulin',
u'userAvg': 0.0,
u'user_id': None,
u'venue_name': u'Journal of Industrial Microbiology & Biotechnology',
u'views_count': 0,
u'volume': None},
u'_type': u'listing'},
{u'_id': u'7521A721',
u'_index': u'scibase_listings',
u'_score': 1.0,
u'_source': {u'authors': [{u'author_id': u'7FF872BC',
u'author_name': u'barbara eileen ryan'}],
u'deleted': 0,
u'description': None,
u'doi': u'',
u'is_valid': 1,
u'issue': None,
u'journal': u'The American Historical Review',
u'link': None,
u'meta_description': None,
u'meta_keywords': None,
u'normalized_venue_name': u'american historical review',
u'pages': None,
u'parent_keywords': [u'Social science',
u'Politics',
u'Sociology',
u'Law'],
u'pub_date': u'1992-01-01 00:00:00',
u'pubtype': None,
u'rating_avg_weighted': 0,
u'rating_clarity': 0.0,
u'rating_clarity_weighted': 0.0,
u'rating_innovation': 0.0,
u'rating_innovation_weighted': 0.0,
u'rating_num_weighted': 0,
u'rating_reproducability': 0,
u'rating_reproducibility_weighted': 0.0,
u'rating_versatility': 0.0,
u'rating_versatility_weighted': 0.0,
u'review_count': 0,
u'tag': [u'social movements'],
u'title': u"Feminism and the women's movement : dynamics of change in social movement ideology, and activism",
u'userAvg': 0.0,
u'user_id': None,
u'venue_name': u'The American Historical Review',
u'views_count': 0,
u'volume': None},
u'_type': u'listing'},
{u'_id': u'7DAEB9A4',
u'_index': u'scibase_listings',
u'_score': 1.0,
u'_source': {u'authors': [{u'author_id': u'0299B8E9',
u'author_name': u'fraser j harbutt'}],
u'deleted': 0,
u'description': None,
u'doi': u'',
u'is_valid': 1,
u'issue': None,
u'journal': u'The American Historical Review',
u'link': None,
u'meta_description': None,
u'meta_keywords': None,
u'normalized_venue_name': u'american historical review',
u'pages': None,
u'parent_keywords': [u'Superconductivity',
u'Nuclear fusion',
u'Geology',
u'Chemistry',
u'Metallurgy'],
u'pub_date': u'1988-01-01 00:00:00',
u'pubtype': None,
u'rating_avg_weighted': 0,
u'rating_clarity': 0.0,
u'rating_clarity_weighted': 0.0,
u'rating_innovation': 0.0,
u'rating_innovation_weighted': 0.0,
u'rating_num_weighted': 0,
u'rating_reproducability': 0,
u'rating_reproducibility_weighted': 0.0,
u'rating_versatility': 0.0,
u'rating_versatility_weighted': 0.0,
u'review_count': 0,
u'tag': [u'iron'],
u'title': u'The iron curtain : Churchill, America, and the origins of the Cold War',
u'userAvg': 0.0,
u'user_id': None,
u'venue_name': u'The American Historical Review',
u'views_count': 0,
u'volume': None},
u'_type': u'listing'},
{u'_id': u'7B3236C5',
u'_index': u'scibase_listings',
u'_score': 1.0,
u'_source': {u'authors': [{u'author_id': u'7DAB7B72',
u'author_name': u'richard m freeland'}],
u'deleted': 0,
u'description': None,
u'doi': u'',
u'is_valid': 1,
u'issue': None,
u'journal': u'The American Historical Review',
u'link': None,
u'meta_description': None,
u'meta_keywords': None,
u'normalized_venue_name': u'american historical review',
u'pages': None,
u'parent_keywords': [u'Political Science', u'Economics'],
u'pub_date': u'1985-01-01 00:00:00',
u'pubtype': None,
u'rating_avg_weighted': 0,
u'rating_clarity': 0.0,
u'rating_clarity_weighted': 0.0,
u'rating_innovation': 0.0,
u'rating_innovation_weighted': 0.0,
u'rating_num_weighted': 0,
u'rating_reproducability': 0,
u'rating_reproducibility_weighted': 0.0,
u'rating_versatility': 0.0,
u'rating_versatility_weighted': 0.0,
u'review_count': 0,
u'tag': [u'foreign policy'],
u'title': u'The Truman Doctrine and the origins of McCarthyism : foreign policy, domestic politics, and internal security, 1946-1948',
u'userAvg': 0.0,
u'user_id': None,
u'venue_name': u'The American Historical Review',
u'views_count': 0,
u'volume': None},
u'_type': u'listing'}],
u'max_score': 1.0,
u'total': 36429433},
u'timed_out': False,
u'took': 170}
감사합니다. 나는 그대의 모범으로 나는 그것이 어떻게 '어지럽 혀지지 않는지'를 이해했다고 생각한다. 내가 잘못 본 것이 아니라면, '출처'는 엉킴이없고 ID와 유형이 일치합니다. 내가 그것으로 뛰어 들게 해주세요. 당신은 최고예요! 하지만 그 코드는 내게이 오류를 준다 : TypeError : 'NoneType'유형의 객체에는 len()이 없다. 나는 같은 결과물을 얻지 못한다. –
@DanielVargas : 행 원본에 대한 내부 조인이고 추가 열의 테이블입니다. –
@DanielVargas : 마침내 첫 번째 인수를 올바르게 파악하고 이에 따라 업데이트했습니다. 이것은'_id' 키로 메타 데이터 예제를 보강 할 수있는 기회를주었습니다. –