2017-03-06 2 views
0

squawka.com에서 찢어진 xml 파일을 읽을 수있는 CSV 파일로 변환하려고합니다. 나는 가정이 작업을 수행하는 스크립트를 찾았지만 항상이 오류가 발생합니다 : 그래서,팬더와 멀티 프로세싱을 사용하는 Python IndexError

import gc 
import glob 
import logging 
import multiprocessing 
import numpy as np 
import os 
import pandas as pd 
import re 
import warnings 
from dateutil import parser 
from functools import partial 
from lxml import etree 


COMPETITIONS = { 
    4: 'World Cup', 
    5: 'Champions League', 
    6: 'Europa League', 
    8: 'English Barclays Premier League', 
    9: 'Dutch Eredivisie', 
    10: 'Football League Championship', 
    21: 'Italian Serie A', 
    22: 'German Bundesliga', 
    23: 'Spanish La Liga', 
    24: 'French Ligue 1', 
    98: 'US Major League Soccer', 
    114: 'Turkish Super Lig', 
    129: 'Russian Premier League', 
    199: 'Mexican Liga MX - Apertura', 
    214: 'Australian A-League', 
    363: 'Brazilian Serie A', 
    385: 'Mexican Liga MX - Clausura', 
} 


TIME_SLICE_EVENTS = [ 
    'action_areas', 
    'all_passes', 
    'balls_out', 
    'blocked_events', 
    'cards', 
    'clearances', 
    'corners', 
    'crosses', 
    'extra_heat_maps', 
    'fouls', 
    'goal_keeping', 
    'goals_attempts', 
    'headed_duals', 
    'interceptions', 
    'keepersweeper', 
    'offside', 
    'oneonones', 
    'setpieces', 
    'tackles', 
    'takeons', 
] 
ALL_STATISTICS = sorted(TIME_SLICE_EVENTS + ['players', 'teams']) 

logger = logging.getLogger() 
handler = logging.StreamHandler() 
formatter = logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s') 
handler.setFormatter(formatter) 
logger.addHandler(handler) 
logger.setLevel(logging.DEBUG) 


class SquawkaReport: 
    """Squawka match report object. 

    :param path: Path to XML-file to generate match report from. 
    """ 

    def __init__(self, path): 
     self.__time_slice_events = TIME_SLICE_EVENTS 
     self.path = path 
     self.xml = self.read_xml(path) 

    # See: https://stackoverflow.com/questions/10967551/how-do-i-dynamically-create-properties-in-python 
    def __getattr__(self, name): 
     if name in self.__time_slice_events: 
      return self._parse_timeslice(name) 
     else: 
      msg = "'{0}' object has no attribute '{1}'" 
      raise AttributeError(msg.format(type(self).__name__, name)) 

    @staticmethod 
    def read_xml(path): 
     """Read XML file. 
     :param path: Path to XML-file. 
     :return: XML tree. 
     """ 
     with open(path, 'r') as f: 
      data = f.read() 
     xml = etree.fromstring(data) 
     return xml 

    def _parse_timeslice(self, filter_type): 
     xpath = '/squawka/data_panel/filters/{filter_type}/time_slice/event' 
     return self._get_elements(xpath.format(filter_type=filter_type)) 

    def _get_elements(self, xpath): 
     elements = self.xml.xpath(xpath) 
     if elements: 
      return self._parse_elements(elements) 
     else: 
      return None 

    def _parse_elements(self, elements): 
     parsed = [dict({c.tag: c.text for c in 
         e.getchildren()}.items() + e.attrib.items()) 
        for e in elements] 
     return parsed 

    @property 
    def competition(self): 
     return re.findall("/(.*)_\d*.xml", self.path)[0] 

    @property 
    def filters(self): 
     filters_element = self.xml.xpath('/squawka/data_panel/filters') 
     if filters_element: 
      return [ch.tag for ch in filters_element[0].getchildren()] 
     # Some match reports don't have data. 
     else: 
      return None 

    @property 
    def kickoff(self): 
     date = self.xml.xpath("/squawka/data_panel/game/kickoff/text()")[0] 
     return parser.parse(date).strftime('%Y-%m-%d %H:%M:%S %z') 

    @property 
    def match_id(self): 
     return int(re.findall("/.*_(\d+).xml", self.path)[0]) 

    @property 
    def name(self): 
     return self.xml.xpath("/squawka/data_panel/game/name/text()")[0] 

    @property 
    def players(self): 
     # TODO: Remove non-player elements 
     xpath = '/squawka/data_panel/players/player' 
     return self._get_elements(xpath) 

    @property 
    def teams(self): 
     xpath = '/squawka/data_panel/game/team' 
     return self._get_elements(xpath) 

    @property 
    def venue(self): 
     return self.xml.xpath("/squawka/data_panel/game/venue/text()")[0] 

    @property 
    def match_info(self): 
     info = ({ 
      'competition': self.competition, 
      'kickoff': self.kickoff, 
      'match_id': self.match_id, 
      'name': self.name, 
      'venue': self.venue, 
     }) 
     for team in self.teams: 
      for k in ['id', 'short_name']: 
       info['_'.join((team['state'], k))] = team[k] 
     return info 


def stats_from_file(path, statistic, convert=True): 
    """Load data for a statistic from file. 

    :param path: Path to file. 
    :param statistic: Statistic to load (e.g. 'goals_attempts', 'cards'). 
    :param convert: Process and clean the data (boolean) 
    :return pd.DataFrame with data 
    """ 
    report = SquawkaReport(path) 
    return stats_from_report(report, statistic, convert) 


def stats_from_report(report, statistic, convert=True): 
    """Load data for a statistic from a SquawkaReport object. 

    :param report: SquawkaReport object 
    :param statistic: Statistic to load (e.g. 'goals_attempts', 'cards'). 
    :param convert: Process and clean the data (boolean) 
    :return pd.DataFrame with data 
    """ 
    stats = pd.DataFrame(getattr(report, statistic)) 
    stats['competition'] = report.competition 
    stats['kickoff'] = report.kickoff 
    stats['match_id'] = report.match_id 
    if convert: 
     return convert_export(stats) 
    else: 
     return stats 


def export_all_stats(xml_dir, out_dir, statistics=ALL_STATISTICS, convert=True, n_jobs=None, 
        sequential=('all_passes', 'extra_heat_maps')): 
    """Export all statistics from all XML-files in a folder to CSV. 

    :param xml_dir: Path to folder containing XML-files 
    :param out_dir: Path to folder to save output to 
    :param statistics: Statistics to export 
    :param convert: Process and clean the data (boolean) 
    :param n_jobs: Number of processes to use 
    :param sequential: Iterable with statistics to process sequentially (for memory-intensive stats) 
    """ 

    xml_paths = glob.glob(os.path.join(xml_dir, '*.xml')) 

    if n_jobs is None: 
     n_jobs = multiprocessing.cpu_count() - 1 

    if not os.path.exists(out_dir): 
     os.makedirs(out_dir) 

    pool = multiprocessing.Pool(n_jobs) 
    for statistic in statistics: 
     if statistic in sequential: 
      df = pd.concat((_load_xml(p, statistic) for p in xml_paths), axis=0, ignore_index=True) 
     else: 
      partial_loader = partial(_load_xml, statistic=statistic) 
      df = pd.concat(pool.imap(partial_loader, xml_paths), axis=0, ignore_index=True) 
     if convert: 
      df = convert_export(df) 
     save_path = os.path.join(out_dir, '{}.csv'.format(statistic)) 
     df.to_csv(save_path, index=False, encoding='utf8') 
     logger.debug("Exported %s to %s", statistic, save_path) 


def _load_xml(path, statistic): 
    """Load XML files ignoring etree.XMLSyntaxErrors. 

    :param path: Path to file. 
    :param statistic: Statistic to load (e.g. 'goals_attempts', 'cards'). 
    :return: XML tree (or None on etree.XMLSyntaxError). 
    """ 
    try: 
     return stats_from_file(path, statistic) 
    except etree.XMLSyntaxError: 
     msg = "XML error loading {}, skipping it...".format(path) 
     warnings.warn(msg, RuntimeWarning) 


def convert_export(df): 
    """Convert a statistics export. 
    :param df: pd.DataFrame with statistics (see e.g. stats_from_file()) 
    :return: processed pd.DataFrame 
    """ 

    def parse_indicator(s, indicator): 
     return s.notnull() & (s == indicator) # Nulls are interpreted as False 

    convert_cols = { 
     'id': 'int', 
     'match_id': 'int', 
     'mins': 'int', 
     'minsec': 'int', 
     'secs': 'int', 
     'team_id': 'int' 
    } 
    coordinate_cols = [ 
     'end', 
     'loc', 
     'middle', 
     'start', 
    ] 
    indicator_cols = { 
     'is_own': 'yes', 
     'headed': 'true', # Note: ignores all falses 
     'shot': 'true', # Note: ignores all falses 
    } 
    # Convert strings to ints. 
    for col in df.columns.intersection(convert_cols): 
     df[col] = df[col].replace('', -1) 
     df.loc[df[col].isnull(), col] = -1 
     df[col] = df[col].astype(convert_cols[col]) 

    # Convert indicator cols. 
    for col in df.columns.intersection(indicator_cols): 
     df[col] = parse_indicator(df[col], indicator_cols[col]) 

    # Convert coordinate cols. 
    for col in df.columns.intersection(coordinate_cols): 
     df[[col + '_x', col + '_y']] = split_coordinates(df[col]) 
     df.drop(col, axis=1, inplace=True) 

    return df 


def split_coordinates(s): 
    """Split Series containing strings with coordinates into a DataFrame. 

    :param s: pd.Series 
    :return: pd.DataFrame with columns 'x' and 'y' 
    """ 
    if s.notnull().all(): 
     concatenated = s 
    else: 
     concatenated = s.copy() 
     concatenated.loc[concatenated.isnull()] = ',' 
    split = pd.DataFrame(concatenated.str.split(',').tolist(), columns=['x', 'y'], dtype=float) 
    return split.replace('', np.nan) 

나는이 아주 새로운 오전 :이 스크립트에서 함수 export_all_stats를 사용하는 것을 시도하고있다

Traceback (most recent call last): 
    File "<stdin>", line 1, in <module> 
    File "squawka\utils.py", line 227, in export_all_stats 
    df = pd.concat(pool.imap(partial_loader, xml_paths), axis=0, ignore_index=True) 
    File "C:\Python27\lib\site-packages\pandas\tools\merge.py", line 812, in concat 
    copy=copy) 
    File "C:\Python27\lib\site-packages\pandas\tools\merge.py", line 842, in __init__ 
    objs = list(objs) 
    File "C:\Python27\lib\multiprocessing\pool.py", line 668, in next 
    raise value 
IndexError: list index out of range 

을 나는 무엇이 잘못되었는지 전혀 모른다. 나는 Google에 시도했지만 대답을 찾지 못했습니다. 누군가 나를 도울 수 있습니까?

답변

0

변수에 pool.imap(partial_loader, xml_paths)을 할당하고 결과의 모양을보고 문제 해결을 시작합니다. 해당 개체를 pd.concat으로 전달할 때 몇 가지 문제가 있습니다. 그 개체를 보지 않고, 내 생각 엔 그 개체는 실제로 무시할 색인을 가지고 있지 않으며, ignore_index=True을 설정하면 존재하지 않는 것을 찾으려고하므로 IndexError: list index out of range입니다.