의사 결정 트리 분류기의 정확도에 관한 질문입니다. 데이터 세트의 마지막 열 값이 'made'(성공)인지 'missed'(실패)인지 예측하는 분류기를 작성해 프로그램을 여러 번 실행했지만, 정확도가 항상 100%로 나옵니다. 저는 95~100% 정도가 나올 것으로 기대했는데, 왜 항상 100%일까요?


다음은 분류 코드입니다:

from math import log 
import operator 

def load_csv(filename): 
    # Reads `filename` via `pd.read_csv` with explicit column names (no header
    # row, "?" treated as a missing value) and returns `(new_list, labels)`.
    #
    # NOTE(review): this listing is incomplete as shown -- the `headers` and
    # `labels` list literals are cut off mid-line, the `while` loop has no
    # body, and `pd`, `i`, `dataset` and `new_list` are never defined in the
    # visible code (`pd` is presumably pandas -- confirm).  Restore the
    # missing lines from the original source before running.
    headers = ["location","w","final_margin","shot_number","period","game_clock","shot_clock","dribbles","touch_time", 
    df = pd.read_csv(filename, header=None, names=headers, na_values="?") 


    while i<len(dataset): 

    labels = ["location","w","final_margin","shot_number","period","game_clock","shot_clock", "dribbles","touch_time", 
    return new_list, labels 

def calcShannonEnt(dataSet):
    """Return the Shannon entropy, in bits, of the class labels in dataSet.

    Each row's class label is its last element.  An empty dataSet has no
    labels to sum over and therefore yields 0.0.
    """
    total = len(dataSet)
    # Tally how many rows carry each class label (first-seen order).
    tally = {}
    for row in dataSet:
        tally[row[-1]] = tally.get(row[-1], 0) + 1
    entropy = 0.0
    for count in tally.values():
        p = float(count) / total
        entropy -= p * log(p, 2)  # log base 2 -> entropy measured in bits
    return entropy

def splitDataSet(dataSet, axis, value):
    """Return the rows of dataSet whose column `axis` equals `value`.

    The splitting column is removed from every returned row, so each row is
    one element shorter than its source row.  Input rows are not modified.

    Args:
        dataSet: sequence of rows (lists or tuples).
        axis: index of the column to test and drop.
        value: value that column `axis` must equal for a row to be kept.

    Returns:
        A new list of reduced rows; empty when no row matches.
    """
    # Slicing around `axis` builds a fresh row without the splitting column.
    # (Previously the reduced row was built but never added to the result,
    # so the function always returned an empty list.)
    return [
        featVec[:axis] + featVec[axis + 1:]
        for featVec in dataSet
        if featVec[axis] == value
    ]

def chooseBestFeatureToSplit(dataSet):
    """Pick the feature column whose split gives the largest information gain.

    Every column except the last (the class label) is a candidate.  For each
    candidate the rows are partitioned by that column's distinct values and
    the size-weighted entropy of the partitions is compared with the entropy
    of the whole dataSet.  Diagnostic figures are printed per feature.

    Returns:
        The index of the best column, or -1 when no column achieves a gain
        strictly greater than zero.
    """
    featureCount = len(dataSet[0]) - 1  # last column holds the class labels
    rowCount = float(len(dataSet))
    parentEntropy = calcShannonEnt(dataSet)
    winner = -1
    winnerGain = 0.0
    for col in range(featureCount):
        distinctVals = set([row[col] for row in dataSet])
        splitEntropy = 0.0
        for val in distinctVals:
            subset = splitDataSet(dataSet, col, val)
            weight = len(subset) / rowCount
            splitEntropy += weight * calcShannonEnt(subset)

        gain = parentEntropy - splitEntropy  # reduction in entropy
        print("feature : " + str(col))
        print("baseEntropy : "+str(parentEntropy))
        print("newEntropy : " + str(splitEntropy))
        print("infoGain : " + str(gain))
        if gain > winnerGain:
            winnerGain = gain
            winner = col
    return winner

def majorityCnt(classList):
    """Return the most frequent class label in classList.

    Ties are resolved in favour of the label that appears first in
    classList (the sort is stable and the counts are kept in first-seen
    order).

    Raises:
        IndexError: if classList is empty.
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # dict.items() works on both Python 2 and 3; iteritems() is Python 2 only.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

def createTree(dataSet, labels):
    """Recursively build a decision tree from dataSet.

    Args:
        dataSet: non-empty list of rows; the last element of each row is the
            class label, the preceding elements are feature values.
        labels: feature names, positionally aligned with the feature columns
            of dataSet.  The list is NOT modified.

    Returns:
        Either a class label (leaf) or a nested dict of the form
        ``{feature_name: {feature_value: subtree_or_label, ...}}``.

    Raises:
        IndexError: if dataSet is empty.
    """
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # every row has the same class: pure leaf
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)  # no features left to split on

    bestFeat = chooseBestFeatureToSplit(dataSet)
    if bestFeat == -1:
        # No feature yields a positive information gain.  Indexing with -1
        # would silently split on the class-label column, so fall back to
        # the majority class instead.
        return majorityCnt(classList)
    bestFeatLabel = labels[bestFeat]

    # Build the label list for the subtrees without touching the caller's
    # list (an in-place `del` here would corrupt `labels` for later use,
    # e.g. when passing it to classify()).
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]

    myTree = {bestFeatLabel: {}}
    for value in set(example[bestFeat] for example in dataSet):
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree

def classify(inputTree, featLabels, testVec):
    """Predict the class label of testVec by walking a decision tree.

    Args:
        inputTree: nested dict ``{feature_name: {feature_value: subtree}}``
            in which a non-dict subtree is a class label (leaf).
        featLabels: feature names, positionally aligned with testVec.
        testVec: feature values of the sample to classify.

    Returns:
        The class label stored at the leaf that testVec reaches.

    Raises:
        ValueError: if the tree tests a feature name missing from featLabels.
        KeyError: if testVec holds a value for which the tree has no branch.
    """
    firstStr = list(inputTree)[0]  # the feature tested at this node
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    valueOfFeat = secondDict[testVec[featIndex]]
    if isinstance(valueOfFeat, dict):
        # Internal node: keep descending.  (Without the else-branch the
        # recursive result was overwritten and the raw subtree returned.)
        classLabel = classify(valueOfFeat, featLabels, testVec)
    else:
        classLabel = valueOfFeat  # leaf: this is the prediction
    return classLabel

def storeTree(inputTree, filename):
    """Serialize inputTree to `filename` with pickle.

    The file is opened in binary mode (pickle writes bytes; text mode fails
    on Python 3) and is closed even if pickling raises.
    """
    import pickle
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)

def grabTree(filename):
    """Load and return a tree previously pickled to `filename`.

    The file is opened in binary mode (required by pickle on Python 3) and
    is closed before returning.
    """
    import pickle
    # SECURITY: unpickling can execute arbitrary code -- only load files
    # that come from a trusted source.
    with open(filename, 'rb') as fr:
        return pickle.load(fr)

def accuracy_metric(actual, predicted):
    """Return the percentage (0.0-100.0) of positions where the two agree.

    `predicted` is indexed at every position of `actual`, so it must be at
    least as long.  An empty `actual` raises ZeroDivisionError.
    """
    hits = sum(1 for idx in range(len(actual)) if actual[idx] == predicted[idx])
    return hits / float(len(actual)) * 100.0

# Feature names in column order; the class label is the last element of each
# row and is therefore not listed here.
feature_names = ["location", "w", "final_margin", "shot_number", "period",
                 "game_clock", "shot_clock", "dribbles", "touch_time",
                 "shot_dist", "pts_type", "close_def_dist"]

# collect data
myDat, labels = load_csv('data/basketball.train.csv')
# build a tree
mytree = createTree(myDat, labels)

# run test
# NOTE: the tree is scored on the very rows it was built from, so this
# measures how well it fits the training data, not how well it generalizes.
predictions = []
for row in myDat:
    # Pass only the feature columns; the trailing class label is excluded.
    prediction = classify(mytree, feature_names, row[:len(feature_names)])
    predictions.append(prediction)
    #print('Expected=%s, Got=%s' % (row[-1], prediction))
actual = [row[-1] for row in myDat]
accuracy = accuracy_metric(actual, predictions)

데이터 세트를 별도의 훈련용 데이터 세트와 테스트용 데이터 세트로 분할하지 않은 것으로 보입니다. (다음은 데이터 세트의 일부이며, 원본은 74,000개가 넘는 행으로 이루어져 있습니다.) 그 결과 분류기가 이 데이터 세트에 과적합(overfitting)되어, 데이터 세트 밖의 샘플에서는 제대로 동작하지 않을 수 있습니다.

데이터의 75%를 무작위로 선택해 훈련에 사용하고, 나머지 25%로 정확도를 테스트해 보십시오.

import random 

# Feature names in column order; the class label is the last element of each
# row and is therefore not listed here.
feature_names = ["location", "w", "final_margin", "shot_number", "period",
                 "game_clock", "shot_clock", "dribbles", "touch_time",
                 "shot_dist", "pts_type", "close_def_dist"]

dataset, labels = load_csv('data/basketball.train.csv')

# Shuffle before splitting so the 75/25 split is actually random; slicing the
# rows in file order would keep whatever ordering the CSV happens to have.
shuffled = random.sample(dataset, len(dataset))
split_index = int(len(shuffled) * 0.75)

train_dataset = shuffled[:split_index]
test_dataset = shuffled[split_index:]

mytree = createTree(train_dataset, labels)

predictions = []
for row in test_dataset:
    try:
        # Pass only the feature columns; the trailing class label is excluded.
        prediction = classify(mytree, feature_names, row[:len(feature_names)])
    except KeyError:
        # The tree has no branch for a feature value that never occurred in
        # the training rows; record "no prediction", which counts as a miss.
        prediction = None
    predictions.append(prediction)
    #print('Expected=%s, Got=%s' % (row[-1], prediction))
actual = [row[-1] for row in test_dataset]
accuracy = accuracy_metric(actual, predictions)

(참고: 위 코드는 검증되지 않았습니다.)