단계를 워드 넷 경우에만 : 문서 수준> 문장을 -> 토큰 -> POS -> Lemmas
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
#example text text = 'What can I say about this place. The staff of these restaurants is nice and the eggplant is not bad'
class Splitter(object):
"""
split the document into sentences and tokenize each sentence
"""
def __init__(self):
self.splitter = nltk.data.load('tokenizers/punkt/english.pickle')
self.tokenizer = nltk.tokenize.TreebankWordTokenizer()
def split(self,text):
"""
out : ['What', 'can', 'I', 'say', 'about', 'this', 'place', '.']
"""
# split into single sentence
sentences = self.splitter.tokenize(text)
# tokenization in each sentences
tokens = [self.tokenizer.tokenize(sent) for sent in sentences]
return tokens
class LemmatizationWithPOSTagger(object):
def __init__(self):
pass
def get_wordnet_pos(self,treebank_tag):
"""
return WORDNET POS compliance to WORDENT lemmatization (a,n,r,v)
"""
if treebank_tag.startswith('J'):
return wordnet.ADJ
elif treebank_tag.startswith('V'):
return wordnet.VERB
elif treebank_tag.startswith('N'):
return wordnet.NOUN
elif treebank_tag.startswith('R'):
return wordnet.ADV
else:
# As default pos in lemmatization is Noun
return wordnet.NOUN
def pos_tag(self,tokens):
# find the pos tagginf for each tokens [('What', 'WP'), ('can', 'MD'), ('I', 'PRP') ....
pos_tokens = [nltk.pos_tag(token) for token in tokens]
# lemmatization using pos tagg
# convert into feature set of [('What', 'What', ['WP']), ('can', 'can', ['MD']), ... ie [original WORD, Lemmatized word, POS tag]
pos_tokens = [ [(word, lemmatizer.lemmatize(word,self.get_wordnet_pos(pos_tag)), [pos_tag]) for (word,pos_tag) in pos] for pos in pos_tokens]
return pos_tokens
lemmatizer = WordNetLemmatizer()
splitter = Splitter()
lemmatization_using_pos_tagger = LemmatizationWithPOSTagger()
#step 1 split document into sentence followed by tokenization
tokens = splitter.split(text)
#step 2 lemmatization using pos tagger
lemma_pos_token = lemmatization_using_pos_tagger.pos_tag(tokens)
print(lemma_pos_token)
또한 위성 형용사 기억 =)'ADJ_SAT = 's'' http://wordnet.princeton.edu/wordnet/man/wngloss.7WN.html – alvas
''it''에 대한 pos 태그 'm loving it.' '문자열은'PRP '입니다.이 함수는 lemmatizer가 받아들이지 않는 빈 문자열을 반환하고'KeyError'를 던집니다. 이 경우 무엇을 할 수 있습니까? –
누구나 이것이 전체 문서를 처리 할 때 얼마나 효율적인지 알고 있습니까? – Ksofiac