2016-09-19 4 views
2

R에서 wordnet 라이브러리를 사용하여 코퍼스의 표제어(lemma)를 얻을 수 있었습니다. 아래는 제가 사용한 코드입니다. (제목: R에서 wordnet으로 얻은 표제어로 코퍼스의 단어를 대체하는 방법)

library(tm) 

# Seven short sample documents that make up the toy corpus.
doc1 <- "Stray cats are running all over the place. I see 10 a day!"
doc2 <- "Cats are killers. They kill billions of animals a year."
doc3 <- "The best food in Columbus, OH is the North Market."
doc4 <- "Brand A is the best tasting cat food around. Your cat will love it."
doc5 <- "Buy Brand C cat food for your cat. Brand C makes healthy and happy cats."
doc6 <- "The Arnold Classic came to town this weekend. It reminds us to be healthy."
doc7 <- "I have nothing to say. In summary, I have told you nothing."

# Collect the documents in a list labelled "doc1", "doc2", ...
doc.list <- list(doc1, doc2, doc3, doc4, doc5, doc6, doc7)
N.docs <- length(doc.list)
doc.list <- setNames(doc.list, paste0("doc", seq_len(N.docs)))

# The search query is appended to the documents as one extra entry.
query <- "Healthy cat food"

# Wrap documents + query in a tm source and record a label for each entry.
my.docs <- VectorSource(c(doc.list, query))
my.docs$Names <- c(names(doc.list), "query")

# Build the corpus and print its summary.
my.corpus <- Corpus(my.docs)
my.corpus

# ---- Corpus cleaning ----
# Lower-case once, up front, so the stop-word and custom-word filters match.
# (Nothing later in this pipeline re-introduces upper case, so a second
# tolower pass is unnecessary.)
my.corpus <- tm_map(my.corpus, content_transformer(tolower))

# Transformer that replaces every match of `pattern` with a single space.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# Deletes every character that is not an ASCII letter, digit or space.
removeSpecialChars <- function(x) gsub("[^a-zA-Z0-9 ]", "", x)

# Turn separator-like symbols into spaces in a single pass; "-" is placed
# last inside the character class so it is taken literally.
my.corpus <- tm_map(my.corpus, toSpace, "[/:;@(),_-]")

# Drop all remaining non-alphanumeric characters. After this step no
# punctuation is left, so a separate removePunctuation pass would be a no-op.
my.corpus <- tm_map(my.corpus, content_transformer(removeSpecialChars))

my.corpus <- tm_map(my.corpus, removeWords, stopwords("en"))
my.corpus <- tm_map(my.corpus, removeNumbers)
my.corpus <- tm_map(my.corpus, removeWords,
                    c("status", "please", "need", "mail", "email", "unable",
                      "re", "fw", "st", "th", "sep", "nov", "thank",
                      "kmmvlkm", "prb"))

# Whitespace clean-up goes last: every removal above can leave runs of
# blanks behind. Trimming the ends as well prevents empty tokens when the
# text is later split on whitespace.
my.corpus <- tm_map(my.corpus, stripWhitespace)
my.corpus <- tm_map(my.corpus, content_transformer(trimws))

# Reduce every remaining token in the corpus to its stem.
my.corpus <- tm_map(x = my.corpus, FUN = stemDocument)

library(wordnet)

# Location of the WordNet dictionary files (machine-specific install path).
wordnet.dict <- "C:/Program Files (x86)/WordNet/2.1/dict"
setDict(wordnet.dict)
initDict(wordnet.dict)

# Look up a single token among WordNet's noun index terms and return the
# lemma of the first term that starts with it. When WordNet has no match the
# token itself is returned, so every token always maps to exactly one string
# and the result can be used as replacement text.
lookupLemma <- function(word) {
  x.filter <- getTermFilter("StartsWithFilter", word, TRUE)
  terms <- getIndexTerms("NOUN", 1, x.filter)
  if (is.null(terms)) {
    return(word)
  }
  getLemma(terms[[1L]])
}

# For each document: split the text on whitespace, discard empty tokens
# (a leading blank would otherwise produce "" and "starts with" would then
# match an arbitrary term), and map every token to its lemma.
# The result is a list with one named character vector per document
# (names = original tokens, values = lemmas).
my.lemmas <- lapply(my.corpus, function(x) {
  words <- unlist(strsplit(as.character(x), "[[:space:]]+"))
  words <- words[nzchar(words)]
  vapply(words, lookupLemma, character(1))
})
my.lemmas

이제 wordnet 라이브러리를 사용하여 표제어(lemma)를 얻었고, 코퍼스의 단어를 그 표제어로 교체하고 싶습니다. 해결 방법을 아시는 분이 공유해 주시면 큰 도움이 될 것입니다.

답변

0

이것을 시도해 보십시오.

# Lemma lookup for every document of the corpus. `Output` is a list holding
# one named character vector per document (names = original tokens,
# values = lemmas), which makes it suitable for conversion back to a corpus.
Output <- lapply(my.corpus, function(x) {
  tokens <- unlist(strsplit(as.character(x), "[[:space:]]+"))
  # Skip empty tokens: "" combined with a starts-with filter would match an
  # arbitrary WordNet term.
  tokens <- tokens[nzchar(tokens)]
  vapply(tokens, function(word) {
    x.filter <- getTermFilter("StartsWithFilter", word, TRUE)
    terms <- getIndexTerms("NOUN", 1, x.filter)
    # Keep the original token when WordNet has no matching noun, so no word
    # is lost from the document.
    if (is.null(terms)) word else getLemma(terms[[1L]])
  }, character(1))
})

Output은 리스트여야 합니다. 해당 리스트를 코퍼스로 변환하십시오.

# Convert the per-document list held in `Output` into a tm corpus object.
cprs <- as.VCorpus(x = Output)

그런 다음 cprs를 DTM(문서-단어 행렬)으로 변환하십시오. 도움이 되었기를 바랍니다.