2017-09-19 6 views
-1
from nltk import word_tokenize, pos_tag 
from nltk.corpus import wordnet as wn 

def penn_to_wn(tag): 
    """ Convert between a Penn Treebank tag to a simplified Wordnet tag """ 
    if tag.startswith('N'): 
     return 'n' 

    if tag.startswith('V'): 
     return 'v' 

    if tag.startswith('J'): 
     return 'a' 

    if tag.startswith('R'): 
     return 'r' 

    return None 

def tagged_to_synset(word, tag): 
    wn_tag = penn_to_wn(tag) 
    if wn_tag is None: 
     return None 

    try: 
     return wn.synsets(word, wn_tag)[0] 
    except: 
     return None 

def sentence_similarity(sentence1, sentence2): 
    """ compute the sentence similarity using Wordnet """ 
    # Tokenize and tag 
    sentence1 = pos_tag(word_tokenize(sentence1)) 
    sentence2 = pos_tag(word_tokenize(sentence2)) 

    # Get the synsets for the tagged words 
    synsets1 = [tagged_to_synset(*tagged_word) for tagged_word in sentence1] 
    synsets2 = [tagged_to_synset(*tagged_word) for tagged_word in sentence2] 

    # Filter out the Nones 
    synsets1 = [ss for ss in synsets1 if ss] 
    synsets2 = [ss for ss in synsets2 if ss] 

    score, count = 0.0, 0 

    # For each word in the first sentence 
    for synset in synsets1: 
     # Get the similarity value of the most similar word in the other sentence 

      **best_score = max([(synset.path_similarity(ss)) for ss in synsets2])** 


     # Check that the similarity could have been computed 
    if best_score is not None: 
      score += best_score 
      count += 1 

    # Average the values 
    score /= count 
    return score 

if __name__ == '__main__': 
sentences = [ 
    'Password should not be less than 8 characters.', 
    'The user should enter valid user name and password.', 
    'User name should not have special characters.', 
    'Datta passed out from IIT', 
] 

focus_sentence = 'The user should enter valid user name and password and password should have greater than or equal to 8 characters.' 
for sentence in sentences: 
    print(sentence_similarity(focus_sentence, sentence)) 
+0

'synset.path_similarity (ss)'는 때때로'None'이며'max()'호출이 실패하기 때문에 오류 메시지는 분명합니다. –

+0

그렇다면 어떻게해야합니까? – Dattatreya

답변

1

아래 코드의 대담한에있는 기능 path_similarity()None를 반환 할 수 있다는 것입니다 다음 max() 호출이 실패 . 이 사건이 언제 발생하는지 확인하는 문제입니다. 가능한 해결책은 path_similarity()에서 None 값을 제외한 simlist 목록을 작성하는 것입니다. simlist이 비어 있으면 현재 반복을 건너 뛰고, 그렇지 않으면 max()를 호출하고 나머지 반복을 계속합니다.

# For each word in the first sentence 
for synset in synsets1: 
    # Get the similarity value of the most similar word in the other sentence 
    simlist = [synset.path_similarity(ss) for ss in synsets2 if synset.path_similarity(ss) is not None] 
    if not simlist: 
     continue; 
    best_score = max(simlist) 

    # Check that the similarity could have been computed 
    score += best_score 
    count += 1 

if count == 0: 
return 0 

# Average the values 
score /= count 
return score