2016-09-28

I am trying to build sentiment analysis for Twitter following the NLTK tutorial, but when the code reaches the line short_pos.split('\n') I get: UnicodeDecodeError: 'ascii' codec can't decode byte 0x96 in position 2: ordinal not in range(128)
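
For context, a minimal sketch of the failure mode (my assumption: Python 2, where byte strings are decoded with the default ASCII codec as soon as unicode is needed):

# Hypothetical reproduction, not taken from the script below: 0x96 is the
# cp1252 en dash, which the ASCII codec cannot decode.
line = 'good \x96 bad'
line.decode('ascii')  # UnicodeDecodeError: 'ascii' codec can't decode byte
                      # 0x96 in position 5: ordinal not in range(128)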

import pickle 
import random 

import nltk 
from nltk import pos_tag 
from nltk.classify import ClassifierI 
from nltk.classify.scikitlearn import SklearnClassifier 
from nltk.tokenize import word_tokenize 
from sklearn.linear_model import LogisticRegression, SGDClassifier 
from sklearn.naive_bayes import MultinomialNB, BernoulliNB 
from sklearn.svm import LinearSVC 
from statistics import mode 


class VoteClassifier(ClassifierI):
    """Ensemble that lets several trained classifiers vote on each sample."""

    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        # Return the label most of the underlying classifiers agree on.
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        # Fraction of classifiers that voted for the winning label.
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)

        choice_votes = votes.count(mode(votes))
        conf = choice_votes / float(len(votes))  # float() so Python 2 doesn't truncate
        return conf
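
# Usage sketch (hypothetical; no voter is built in this script, but the
# classifiers trained below could be combined like this):
#   voted_classifier = VoteClassifier(classifier, MNB_classifier,
#                                     BernoulliNB_classifier)
#   feats = find_features("this movie was great")
#   print(voted_classifier.classify(feats), voted_classifier.confidence(feats))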


short_pos = open("positive.txt", "r").read() 

short_neg = open("negative.txt", "r").read() 

# move this up here 
all_words = [] 
documents = [] 

# j is adjective, r is adverb, and v is verb
# allowed_word_types = ["J","R","V"] 
allowed_word_types = ["J"] 

for p in short_pos.split('\n'):
    documents.append((p, "pos"))
    words = word_tokenize(p)
    pos = pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())

for p in short_neg.split('\n'):
    documents.append((p, "neg"))
    words = word_tokenize(p)
    pos = pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())

save_documents = open("pickled_algos/documents.pickle", "wb") 
pickle.dump(documents, save_documents) 
save_documents.close() 

all_words = nltk.FreqDist(all_words) 

word_features = list(all_words.keys())[:5000] 

save_word_features = open("pickled_algos/word_features5k.pickle", "wb") 
pickle.dump(word_features, save_word_features) 
save_word_features.close() 


def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features
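
# For example (hypothetical call): find_features("a great movie") returns a
# dict with one boolean per word in word_features, e.g. {"great": True,
# "boring": False, ...}, assuming those words made the top-5000 cut.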


featuresets = [(find_features(rev), category) for (rev, category) in documents] 

random.shuffle(featuresets) 
print(len(featuresets)) 

testing_set = featuresets[10000:] 
training_set = featuresets[:10000] 

classifier = nltk.NaiveBayesClassifier.train(training_set) 
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set)) * 100) 
classifier.show_most_informative_features(15) 

############### 
save_classifier = open("pickled_algos/originalnaivebayes5k.pickle", "wb") 
pickle.dump(classifier, save_classifier) 
save_classifier.close() 

MNB_classifier = SklearnClassifier(MultinomialNB()) 
MNB_classifier.train(training_set) 
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set)) * 100) 

save_classifier = open("pickled_algos/MNB_classifier5k.pickle", "wb") 
pickle.dump(MNB_classifier, save_classifier) 
save_classifier.close() 

BernoulliNB_classifier = SklearnClassifier(BernoulliNB()) 
BernoulliNB_classifier.train(training_set) 
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set)) * 100) 

save_classifier = open("pickled_algos/BernoulliNB_classifier5k.pickle", "wb") 
pickle.dump(BernoulliNB_classifier, save_classifier) 
save_classifier.close() 

LogisticRegression_classifier = SklearnClassifier(LogisticRegression()) 
LogisticRegression_classifier.train(training_set) 
print("LogisticRegression_classifier accuracy percent:", 
     (nltk.classify.accuracy(LogisticRegression_classifier, testing_set)) * 100) 

save_classifier = open("pickled_algos/LogisticRegression_classifier5k.pickle", "wb") 
pickle.dump(LogisticRegression_classifier, save_classifier) 
save_classifier.close() 

LinearSVC_classifier = SklearnClassifier(LinearSVC()) 
LinearSVC_classifier.train(training_set) 
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set)) * 100) 

save_classifier = open("pickled_algos/LinearSVC_classifier5k.pickle", "wb") 
pickle.dump(LinearSVC_classifier, save_classifier) 
save_classifier.close() 

# NuSVC_classifier = SklearnClassifier(NuSVC()) 
# NuSVC_classifier.train(training_set) 
# print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100) 

SGDC_classifier = SklearnClassifier(SGDClassifier()) 
SGDC_classifier.train(training_set) 
print("SGDClassifier accuracy percent:", nltk.classify.accuracy(SGDC_classifier, testing_set) * 100) 

save_classifier = open("pickled_algos/SGDC_classifier5k.pickle", "wb") 
pickle.dump(SGDC_classifier, save_classifier) 
save_classifier.close() 
Too long to read! –

See [How to ask](http://stackoverflow.com/help/how-to-ask). – user8

Take a look at http://stackoverflow.com/a/35444608/610569. I don't think this is an nltk problem. – alvas

Answer

Open the file in text mode with the right encoding and it will run, for example:

import io

with io.open("positive.txt", "r", encoding="UTF8") as fd:
    short_pos = fd.read()
Traceback (most recent call last):
  File "C:\Python27\picklesave.py", line 44, in <module>
    short_pos = fd.read()
  File "C:\Python27\lib\codecs.py", in decode
    (result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf8' codec can't decode byte 0xf3 in position 4645: invalid continuation byte –

What is the encoding of the file? –
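
If the files are not valid UTF-8 (the 0xf3 / invalid continuation byte above suggests they are not), a plausible candidate is cp1252, since the 0x96 from the original error is the cp1252 en dash. A minimal sketch for guessing the encoding, assuming the third-party chardet package is installed (the thread does not confirm this):

import io

import chardet  # third-party: pip install chardet

# Guess the encoding from the raw bytes, then reopen with that guess.
with open("positive.txt", "rb") as fd:
    guess = chardet.detect(fd.read())
print(guess)  # e.g. {'encoding': 'Windows-1252', 'confidence': 0.73}

with io.open("positive.txt", "r", encoding=guess["encoding"]) as fd:
    short_pos = fd.read()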