문제가 데이터 세트를 분리하는 방식인지 또는 내가 잘못하고 있는지 알 수 없지만 프로그램을 실행할 때마다 정확도가 달라집니다. 아무도 내가 문제를 찾을 수 있도록 도와 줄 수 있습니까? 주셔서 감사합니다 여기 내 코드입니다 :코드를 수정하지 않고 다른 정확도 얻기
여기import pandas as pd
import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
# load the data
from sklearn.tree import DecisionTreeClassifier
# url = "data/lung-cancer.data"
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/lung- cancer/lung-cancer.data"
data_set = pd.read_csv(url)
def clean_data(data_set):
# replace the ? with NaN
data_set = data_set.convert_objects(convert_numeric=True)
# replace the NaN with the average of the row
data_set = data_set.fillna(data_set.mean(axis=0), axis=0)
return data_set
data_set = clean_data(data_set)
def split_data(data_set):
# split the data in two parts train(80%), test(20%)
train, test = train_test_split(data_set.values, test_size=0.2)
# first column of the data are labels
labels_test = test[:, :1]
labels_train = train[:, :1]
# the rest of the columns are features
features_test = test[:, 1:]
features_train = train[:, 1:]
return features_train, labels_train, features_test, labels_test
features_train, labels_train, features_test, labels_test = split_data(data_set)
"""
print(labels_train)
print(features_train)
print(features_test)
print(labels_test)
"""
# Modeling step Test different algorithms
random_state = 2
classifiers = [
GaussianNB(),
KNeighborsClassifier(n_neighbors=3),
KNeighborsClassifier(n_neighbors=5),
SVC(kernel="poly", C=0.4, probability=True),
DecisionTreeClassifier(random_state=3),
RandomForestClassifier(random_state=3),
AdaBoostClassifier(random_state=3),
ExtraTreesClassifier(random_state=3),
GradientBoostingClassifier(random_state=3),
MLPClassifier(random_state=random_state)
]
accuracy_res = []
algorithm_res = []
for clf in classifiers:
clf.fit(features_train, labels_train)
name = clf.__class__.__name__
train_predictions = clf.predict(features_test)
accuracy = accuracy_score(labels_test, train_predictions)
print(name, "{:.4%}".format(accuracy))
accuracy_res.append(accuracy)
algorithm_res.append(name)
print()
y_pos = np.arange(len(algorithm_res))
plt.barh(y_pos, accuracy_res, align='center', alpha=0.5)
plt.yticks(y_pos, algorithm_res)
plt.xlabel('Accuracy')
plt.title('Algorithms')
plt.show()
내가지고있어 결과입니다 : 첫 번째 결과
GaussianNB 28.5714%
KNeighborsClassifier 57.1429%
KNeighborsClassifier 71.4286%
SVC 57.1429%
DecisionTreeClassifier 57.1429%
RandomForestClassifier 42.8571%
AdaBoostClassifier 42.8571%
ExtraTreesClassifier 42.8571%
GradientBoostingClassifier 57.1429%
MLPClassifier 57.1429%
두 번째 결과
GaussianNB 28.5714%
KNeighborsClassifier 42.8571%
KNeighborsClassifier 28.5714%
SVC 57.1429%
DecisionTreeClassifier 28.5714%
RandomForestClassifier 57.1429%
AdaBoostClassifier 57.1429%
ExtraTreesClassifier 42.8571%
GradientBoostingClassifier 28.5714%
MLPClassifier 57.1429%
셋째 결과
GaussianNB 71.4286%
KNeighborsClassifier 71.4286%
KNeighborsClassifier 71.4286%
SVC 28.5714%
DecisionTreeClassifier 28.5714%
RandomForestClassifier 57.1429%
AdaBoostClassifier 71.4286%
ExtraTreesClassifier 57.1429%
GradientBoostingClassifier 28.5714%
MLPClassifier 28.5714%
제발 어떻게 저를 바꾸라고 제안합니까? 이 작업을 수행하는 가장 좋은 방법은 무엇입니까? –
데이터를 먼저 나눌 수있는 함수를 작성하십시오 –
고맙습니다. –