
I'm using a decision tree classifier to predict the value of the last column of my dataset as either 'made' or 'missed'. I've run the program several times, but the accuracy is always 100%; I expected it to be around 95-100%. Why is that? Here is a slice of my dataset (the original has some 74,000 rows):

A,L,-5,8,3,475,11.8,1,1.6,6.1,2,2.7,made 
A,L,-39,10,2,30,18.5,6,5.4,24.3,3,3.1,missed 
A,L,-20,8,3,327,6.2,0,1.8,2.3,2,0,missed 
A,W,16,5,1,504,11.7,0,1,18,2,7.3,missed 
A,L,-5,3,2,547,19.9,0,1.2,23.9,3,7.5,made 
H,W,14,4,2,600,17.6,0,0.5,5.5,2,3.8,made 
H,L,-8,6,3,692,23,1,1.9,4.4,2,4.1,made 
H,L,-10,11,3,171,14.4,0,0.9,25.2,3,5.8,missed 

Here is the classifier code:

from math import log
import operator
import pandas as pd  # needed by load_csv below

def load_csv(filename):
    headers = ["location", "w", "final_margin", "shot_number", "period", "game_clock", "shot_clock", "dribbles",
               "touch_time", "shot_dist", "pts_type", "close_def_dist", "target"]
    df = pd.read_csv(filename, header=None, names=headers, na_values="?")

    # flatten the frame, then regroup the values into 13-column rows
    obj_df = list(df.values.flatten())
    new_list = []
    i = 0
    while i < len(obj_df):  # was len(dataset): NameError, dataset is undefined here
        new_list.append(obj_df[i:i + 13])
        i += 13

    labels = ["location", "w", "final_margin", "shot_number", "period", "game_clock", "shot_clock", "dribbles",
              "touch_time", "shot_dist", "pts_type", "close_def_dist"]
    return new_list, labels

def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:  # count the occurrences of each class label
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)  # log base 2
    return shannonEnt
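
For intuition, a quick hypothetical check of the entropy function (toy rows of my own, not the question's data): an even 'made'/'missed' split should give entropy 1.0, and a pure set 0.0.

# Hypothetical sanity check (not from the original post):
# a 2/2 'made'/'missed' split gives -0.5*log2(0.5) - 0.5*log2(0.5) = 1.0.
print(calcShannonEnt([[1, 'made'], [2, 'made'], [3, 'missed'], [4, 'missed']]))  # 1.0
print(calcShannonEnt([[1, 'made'], [2, 'made']]))  # 0.0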


def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]  # chop out the axis used for splitting
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet
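
As a quick illustration of the split (toy rows of my own, not the question's data), filtering on column 0 == 'H' keeps only the matching rows and drops that column:

# Hypothetical example (not from the original post):
rows = [['H', 10, 'made'], ['A', 12, 'missed'], ['H', 3, 'missed']]
print(splitDataSet(rows, 0, 'H'))  # [[10, 'made'], [3, 'missed']]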


def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1  # the last column holds the class label
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):  # iterate over all the features
        featList = [example[i] for example in dataSet]  # every value this feature takes
        uniqueVals = set(featList)  # the unique values
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy  # information gain, i.e. reduction in entropy
        if infoGain > bestInfoGain:  # keep the best gain seen so far
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature  # an integer feature index


def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    # .iteritems() is Python 2 only; .items() works in Python 3 as well
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def createTree(dataSet, labels):
    # extract the class labels
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # stop splitting when all the classes are equal
    if len(dataSet[0]) == 1:  # stop splitting when there are no more features
        return majorityCnt(classList)
    # choose the split with the highest information gain
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]

    # build the tree recursively
    myTree = {bestFeatLabel: {}}
    del labels[bestFeat]  # note: this mutates the caller's labels list
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]  # copy labels so recursion doesn't clobber them
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
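
To see the shape of the output on a tiny hypothetical dataset (my example, not the question's data), the tree comes back as nested dicts keyed first by feature label, then by feature value:

# Hypothetical two-feature example (not from the original post):
data = [['H', 'W', 'made'], ['H', 'L', 'missed'], ['A', 'W', 'missed'], ['A', 'L', 'missed']]
print(createTree(data, ['location', 'w']))
# e.g. {'location': {'H': {'w': {'W': 'made', 'L': 'missed'}}, 'A': 'missed'}}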


def classify(inputTree, featLabels, testVec):
    firstStr = list(inputTree)[0]  # the feature this node splits on
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    key = testVec[featIndex]
    valueOfFeat = secondDict[key]  # KeyError here means a feature value unseen in training
    if isinstance(valueOfFeat, dict):  # internal node: recurse
        classLabel = classify(valueOfFeat, featLabels, testVec)
    else:  # leaf node: the class label itself
        classLabel = valueOfFeat
    return classLabel


def storeTree(inputTree, filename):
    import pickle
    with open(filename, 'wb') as fw:  # pickle needs binary mode, not 'w'
        pickle.dump(inputTree, fw)


def grabTree(filename):
    import pickle
    with open(filename, 'rb') as fr:  # binary mode to match storeTree
        return pickle.load(fr)
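
Once a tree has been built (as done below), a hypothetical round trip through these helpers would look like this (the filename is illustrative):

# Hypothetical usage (not from the original post): persist and reload a tree.
storeTree(mytree, 'tree.pkl')
assert grabTree('tree.pkl') == mytree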

def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0
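
A quick hypothetical check of the metric (my example, not from the post):

print(accuracy_metric(['made', 'missed'], ['made', 'made']))  # 50.0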

# collect data
myDat, labels = load_csv('data/basketball.train.csv')

# build a tree (createTree mutates labels, so keep a separate copy of the feature names)
featLabels = ["location", "w", "final_margin", "shot_number", "period", "game_clock", "shot_clock", "dribbles",
              "touch_time", "shot_dist", "pts_type", "close_def_dist"]
mytree = createTree(myDat, labels)

# run test (on the same rows the tree was trained on)
predictions = []
for row in myDat:
    prediction = classify(mytree, featLabels, row[:12])  # row[:12] is the 12 feature values
    # print('Expected=%s, Got=%s' % (row[-1], prediction))
    predictions.append(prediction)
actual = [row[-1] for row in myDat]
accuracy = accuracy_metric(actual, predictions)
print(accuracy)

Please don't deliberately vandalize your post, e.g. by replacing the last part of the code. By posting on the Stack Exchange network you granted SE an irrevocable right to distribute that content (under the [CC BY-SA 3.0 license](https://creativecommons.org/licenses/by-sa/3.0/)). Per SE policy, any vandalism will be reverted. If you want to disassociate this post from your account, see [What is the proper route for a disassociation request?](https://meta.stackoverflow.com/q/323395) – adiga

Answer


You do not appear to split your dataset into separate training and test sets. As a result, the classifier is overfitted to this dataset, and it may not perform well on samples from outside it.

Randomly select 75% of the data for training and test the accuracy on the remaining 25%:

import random

dataset, labels = load_csv('data/basketball.train.csv')
random.shuffle(dataset)
split_index = int(len(dataset) * 0.75)

train_dataset = dataset[:split_index]
test_dataset = dataset[split_index:]

featLabels = ["location", "w", "final_margin", "shot_number", "period", "game_clock", "shot_clock", "dribbles",
              "touch_time", "shot_dist", "pts_type", "close_def_dist"]
mytree = createTree(train_dataset, labels)

predictions = []
for row in test_dataset:
    prediction = classify(mytree, featLabels, row[:12])
    # print('Expected=%s, Got=%s' % (row[-1], prediction))
    predictions.append(prediction)
actual = [row[-1] for row in test_dataset]
accuracy = accuracy_metric(actual, predictions)
print(accuracy)

(Note: untested)
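
As an additional untested sketch (my own addition, not part of the answer): averaging accuracy over several random 75/25 splits gives a more stable estimate than a single shuffle. Be aware that classify raises a KeyError when a test row contains a feature value never seen in training; this ID3-style tree treats every numeric value as a category, so that is likely with near-continuous columns such as game_clock.

import random

def repeated_holdout(dataset, labels, runs=5, train_frac=0.75):
    # Sketch (untested): mean test accuracy over several random splits.
    feat_labels = labels[:]  # keep a copy; createTree mutates its labels argument
    scores = []
    for _ in range(runs):
        data = dataset[:]
        random.shuffle(data)
        split = int(len(data) * train_frac)
        tree = createTree(data[:split], labels[:])
        test = data[split:]
        # caution: classify raises KeyError for feature values unseen in training
        preds = [classify(tree, feat_labels, row[:12]) for row in test]
        actual = [row[-1] for row in test]
        scores.append(accuracy_metric(actual, preds))
    return sum(scores) / len(scores)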