2013-10-27 3 views
1

긍정 문장과 부정 문장을 가지고 있습니다. Python NLTK로 NaiveBayesClassifier를 학습시켜 다른 문장의 감정을 아주 간단하게 분류하고 싶습니다. 그런데 Python NLTK의 감정 분류 결과가 올바르게 나오지 않습니다.

아래 글의 코드를 사용하려고 하는데, 결과가 항상 긍정(positive)으로 나옵니다. http://www.sjwhitworth.com/sentiment-analysis-in-python-using-nltk/

저는 파이썬을 이제 막 시작한 초보자라서, 코드를 복사하는 과정에서 실수가 있었을 수도 있습니다.

import nltk 
import math 
import re 
import sys 
import os 
import codecs 
reload(sys) 
sys.setdefaultencoding('utf-8') 

from nltk.corpus import stopwords 

#Resolved directory of this script; both tweet files are looked up inside it.
__location__ = os.path.realpath(
    os.path.join(os.getcwd(), os.path.dirname(__file__)))

postweet = "/".join((__location__, "postweet.txt"))
negtweet = "/".join((__location__, "negtweet.txt"))

#Extra stop words that are filtered out of wordlist further down.
customstopwords = ['band', 'they', 'them']

#Read every line of the positive file, then of the negative file.
#The handles p and n stay open here; they are closed at the end of the script.
p = open(postweet, 'r')
postxt = p.readlines()
n = open(negtweet, 'r')
negtxt = n.readlines()

#One label per line read: 'negative' for each negative tweet, 'positive' for each positive one.
neglist = ['negative'] * len(negtxt)
poslist = ['positive'] * len(postxt)

#Pair every raw tweet line with its label; positives come first in the combined list.
postagged = zip(postxt, poslist)
negtagged = zip(negtxt, neglist)
taggedtweets = postagged + negtagged

#Each entry of tweets is (lower-cased whitespace-separated tokens, label).
tweets = [([token.lower() for token in text.split()], label)
          for (text, label) in taggedtweets]

def getwords(tweets):
    """Flatten tagged tweets into one list of words.

    tweets -- iterable of (words, sentiment) pairs, where words is an
              iterable of tokens.
    Returns every token of every tweet, in order, duplicates included.
    The sentiment labels are ignored.
    """
    return [token for (tokens, _label) in tweets for token in tokens]

def getwordfeatures(listoftweets):
    """Return the distinct words of listoftweets via an nltk.FreqDist.

    listoftweets -- flat list of word tokens (the script passes the
                    result of getwords()).
    Returns the keys of the frequency distribution built from the tokens.
    NOTE(review): whether those keys come back ordered by frequency
    depends on the installed NLTK's FreqDist.keys() -- confirm before
    relying on the order.
    """
    #Inspect nltk.FreqDist(listoftweets) directly to see the individual counts.
    return nltk.FreqDist(listoftweets).keys()

#Prints the vocabulary that getwordfeatures() returns for all words of all tweets.
print getwordfeatures(getwords(tweets)) 

#NOTE(review): wordlist starts out empty, so both filters below iterate over
#nothing and wordlist stays []. feature_extractor() loops over wordlist, so it
#then returns an empty feature dict for every document.
wordlist = [] 
#Remove NLTK's English stop words (no effect while wordlist is empty).
wordlist = [i for i in wordlist if not i in stopwords.words('english')] 
#Remove the custom stop words (no effect while wordlist is empty).
wordlist = [i for i in wordlist if not i in customstopwords] 

def feature_extractor(doc):
    """Turn a tokenised document into boolean 'contains(word)' features.

    doc -- iterable of tokens.
    Returns a dict with one 'contains(<word>)' key for each entry of the
    module-level wordlist; the value is True when that word occurs in doc.
    """
    present = set(doc)
    return dict(('contains(%s)' % term, term in present) for term in wordlist)

#Creates a training set - classifier learns distribution of true/falses in the input. 
training_set = nltk.classify.apply_features(feature_extractor, tweets) 
classifier = nltk.NaiveBayesClassifier.train(training_set) 

print classifier.show_most_informative_features(n=30) 

while True: 
    input = raw_input('ads') 
    if input == 'exit': 
     break 
    elif input == 'informfeatures': 
     print classifier.show_most_informative_features(n=30) 
     continue 
    else: 
     input = input.lower() 
     input = input.split() 
     print '\nWe think that the sentiment was ' + classifier.classify(feature_extractor(input)) + ' in that sentence.\n' 

p.close() 
n.close() 

단순히 코드가 잘못된 것인가요, 아니면 다른 문제가 있는 건가요? 실행을 시작하면 classifier.show_most_informative_features(n=30)의 결과가 출력되어야 하는데, 제가 얻는 결과는 아래와 같습니다. 혹시 힌트가 될까 해서 적어 둡니다.

Most Informative Features None

감사합니다.

답변

2

wordlist가 비어 있습니다. getwordfeatures(getwords(tweets))의 결과를 wordlist에 할당해야 합니다.

다음 두 행 :

wordlist = [i for i in wordlist if not i in stopwords.words('english')]

wordlist = [i for i in wordlist if not i in customstopwords]

위의 두 행은 둘 중 하나만 적용해도 되고 둘 다 적용해도 됩니다. 어떤 불용어(stop word) 목록이 더 효과적인지 직접 시험해 볼 수 있습니다.

3

NLTK를 사용한 감정 분석에 관심 있는 모든 분을 위해, 정상적으로 동작하는 전체 코드를 올립니다. @NLPer 님께 감사드립니다.

import nltk 
import math 
import re 
import sys 
import os 
import codecs 
reload(sys) 
sys.setdefaultencoding('utf-8') 

from nltk.corpus import stopwords 

#Resolved directory of this script; both tweet files are looked up inside it.
__location__ = os.path.realpath(
    os.path.join(os.getcwd(), os.path.dirname(__file__)))

postweet = __location__ + "/postweet.txt"
negtweet = __location__ + "/negtweet.txt"


#Extra stop words that are filtered out of wordlist further down.
customstopwords = ['band', 'they', 'them']

#Load positive tweets into a list (one entry per line).
#`with` closes the file as soon as it has been read, even if reading raises.
#p stays bound to the closed file object, so the p.close() at the end of the
#script is still valid (closing an already closed file does nothing).
with open(postweet, 'r') as p:
    postxt = p.readlines()

#Load negative tweets into a list, same handling as above.
with open(negtweet, 'r') as n:
    negtxt = n.readlines()

#Build one sentiment label per tweet line read from each file.
neglist = ['negative' for _line in negtxt]
poslist = ['positive' for _line in postxt]

#Attach the labels to the raw lines and join both groups, positives first.
postagged = zip(postxt, poslist)
negtagged = zip(negtxt, neglist)
taggedtweets = postagged + negtagged

#tweets holds (tokens, label) pairs; tokens are the lower-cased
#whitespace-separated pieces of the raw line.
tweets = []
for (rawline, label) in taggedtweets:
    tokens = [piece.lower() for piece in rawline.split()]
    tweets.append((tokens, label))

def getwords(tweets):
    """Collect the words of all tagged tweets into a single flat list.

    tweets -- iterable of (words, sentiment) pairs.
    Returns the concatenation of every tweet's words in input order;
    the sentiment half of each pair is not used.
    """
    collected = []
    for (tokens, _sentiment) in tweets:
        collected += tokens
    return collected

def getwordfeatures(listoftweets):
    """Return the distinct words found in listoftweets.

    listoftweets -- flat list of word tokens (the script passes the
                    result of getwords()).
    Builds an nltk.FreqDist over the tokens and returns its keys.
    NOTE(review): the ordering of the returned keys is whatever
    FreqDist.keys() provides in the installed NLTK -- confirm before
    assuming it is sorted by frequency.
    """
    #Print `distribution` to look at the individual word counts.
    distribution = nltk.FreqDist(listoftweets)
    return distribution.keys()

#Calls above functions - gives us list of the words in the tweets, ordered by freq. 
print getwordfeatures(getwords(tweets)) 

wordlist = getwordfeatures(getwords(tweets)) 
wordlist = [i for i in wordlist if not i in stopwords.words('english')] 
wordlist = [i for i in wordlist if not i in customstopwords] 

def feature_extractor(doc):
    """Build the boolean feature dict for one tokenised document.

    doc -- iterable of tokens.
    For each word in the module-level wordlist the result contains the key
    'contains(<word>)' mapped to True if the word appears in doc, else False.
    """
    seen = set(doc)
    return {'contains(%s)' % candidate: (candidate in seen)
            for candidate in wordlist}

#Creates a training set - classifier learns distribution of true/falses in the input. 
training_set = nltk.classify.apply_features(feature_extractor, tweets) 
classifier = nltk.NaiveBayesClassifier.train(training_set) 

print classifier.show_most_informative_features(n=30) 

while True: 
    input = raw_input('ads') 
    if input == 'exit': 
     break 
    elif input == 'informfeatures': 
     print classifier.show_most_informative_features(n=30) 
     continue 
    else: 
     input = input.lower() 
     input = input.split() 
     print '\nWe think that the sentiment was ' + classifier.classify(feature_extractor(input)) + ' in that sentence.\n' 

p.close() 
n.close()