2017-11-16 2 views
0

Lucene에서 용어-문서 행렬(term-document matrix)을 얻으려고 합니다. 관련 SO 질문의 대부분은 클래스 구성이 다른 예전 API를 대상으로 한 것 같습니다.

Lucene의 용어-문서 행렬

모든 문서에서 용어 벡터를 얻기 위해 이 두 가지 질문에서 얻은 내용을 결합했습니다. 모든 문서에 대해 용어 벡터 또는 모든 용어의 개수를 얻으려면 어떻게 해야 합니까? 아래가 관련 코드이지만, DocsEnum이 현재 API에서 인식되지 않습니다:

// Question snippet: walks every document id in the index and prints, for each
// term of the "country_text" term vector, the term text, a doc id and a frequency.
// `index` is defined outside this fragment.
IndexReader reader = DirectoryReader.open(index); 

for (int i = 0; i < reader.maxDoc(); i++) { 
    // NOTE(review): `doc` is loaded but never used below.
    Document doc = reader.document(i); 
    // NOTE(review): presumably this is null unless term vectors were stored for
    // the field at index time — confirm against how "country_text" is indexed.
    Terms terms = reader.getTermVector(i, "country_text"); 

    if (terms != null && terms.size() > 0) { 
     // access the terms for this field 
     TermsEnum termsEnum = terms.iterator(); 
     BytesRef term = null; 

     // explore the terms for this field 
     while ((term = termsEnum.next()) != null) { 
      // enumerate through documents, in this case only one 
      // NOTE(review): this is the line the question reports as failing —
      // DocsEnum is not recognized by the current API. The replacement looks
      // like PostingsEnum obtained via TermsEnum.postings(...) — confirm
      // against the Lucene version in use.
      DocsEnum docsEnum = termsEnum.docs(null, null); 
      int docIdEnum; 
      while ((docIdEnum = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { 
       // get the term frequency in the document 
       // NOTE(review): docIdEnum comes from the per-document enumeration, not
       // from the outer loop index `i` — verify which id is wanted in the output.
       System.out.println(term.utf8ToString()+ " " + docIdEnum + " " + docsEnum.freq()); 
      } 
     } 
    } 
} 

전체 코드 :

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Iterator;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.JSONValue;
import org.json.simple.parser.JSONParser;

/**
 * Builds an in-memory Lucene index from a JSON array of country/city records
 * and prints a term-document matrix for the {@code country_text} field.
 *
 * <p>Each output line has the form {@code <term> <docId> <frequency>}, where
 * {@code docId} is the document's id in the index and {@code frequency} is the
 * number of times the term occurs in that document's field.
 */
public class LuceneIndex {

    /** Field whose per-document term vectors are printed. */
    private static final String MATRIX_FIELD = "country_text";

    /**
     * Stored, tokenized text that additionally records term vectors.
     * A plain {@link TextField} does not store term vectors, in which case
     * {@link IndexReader#getTermVector(int, String)} returns {@code null}.
     */
    private static final FieldType TEXT_WITH_TERM_VECTORS = buildTermVectorType();

    /**
     * Indexes {@code wiki_data.json} from the working directory and prints the
     * term-document matrix to standard output.
     *
     * <p>I/O and JSON parsing failures are reported on standard error; they do
     * not propagate, which matches the original best-effort behavior.
     *
     * @param args ignored
     * @throws IOException    declared for compatibility; I/O failures are reported, not rethrown
     * @throws ParseException declared for compatibility; never thrown by this method
     */
    public static void main(String[] args) throws IOException, ParseException {
        String jsonFilePath = "wiki_data.json";
        JSONParser parser = new JSONParser();
        // Specify the analyzer for tokenizing text.
        StandardAnalyzer analyzer = new StandardAnalyzer();
        // The index lives in memory only.
        Directory index = new RAMDirectory();
        IndexWriterConfig config = new IndexWriterConfig(analyzer);

        try {
            // try-with-resources guarantees the JSON reader and the writer are
            // closed (and the index committed) even when parsing fails midway.
            // The file is decoded explicitly as UTF-8 instead of relying on the
            // platform default charset.
            try (Reader json = Files.newBufferedReader(Paths.get(jsonFilePath), StandardCharsets.UTF_8);
                 IndexWriter w = new IndexWriter(index, config)) {
                JSONArray a = (JSONArray) parser.parse(json);

                for (Object o : a) {
                    JSONObject country = (JSONObject) o;
                    String countryName = (String) country.get("country_name");
                    String cityName = (String) country.get("city_name");
                    String countryText = (String) country.get("country_text");
                    String cityText = (String) country.get("city_text");
                    System.out.println(cityName);
                    addDoc(w, countryName, cityName, countryText, cityText);
                }
            }

            // The writer must be closed before opening the reader so that the
            // reader sees every added document.
            try (IndexReader reader = DirectoryReader.open(index)) {
                printTermDocumentMatrix(reader, MATRIX_FIELD);
            }
        } catch (IOException | org.json.simple.parser.ParseException e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints one {@code <term> <docId> <frequency>} line for every term of every
     * document that has a term vector for {@code field}.
     *
     * @param reader open reader over the index
     * @param field  name of a field indexed with term vectors
     * @throws IOException if the index cannot be read
     */
    private static void printTermDocumentMatrix(IndexReader reader, String field) throws IOException {
        for (int docId = 0; docId < reader.maxDoc(); docId++) {
            Terms terms = reader.getTermVector(docId, field);
            if (terms == null) {
                // No term vector was stored for this document/field.
                continue;
            }

            TermsEnum termsEnum = terms.iterator();
            PostingsEnum postings = null;
            BytesRef term;
            while ((term = termsEnum.next()) != null) {
                // PostingsEnum replaces the removed DocsEnum; FREQS is all that is
                // needed here. A term vector covers exactly one document, so the
                // inner loop runs once per term.
                postings = termsEnum.postings(postings, PostingsEnum.FREQS);
                while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                    // Print the index-level doc id: the id reported by the
                    // term-vector postings is local to the vector (always 0).
                    System.out.println(term.utf8ToString() + " " + docId + " " + postings.freq());
                }
            }
        }
    }

    /**
     * Adds one record to the index.
     *
     * @param w           writer to add the document to
     * @param countryName exact-match (untokenized) country name
     * @param cityName    exact-match (untokenized) city name
     * @param countryText analyzed country text; indexed with term vectors
     * @param cityText    analyzed city text
     * @throws IOException if the document cannot be written
     */
    private static void addDoc(IndexWriter w, String countryName, String cityName,
            String countryText, String cityText) throws IOException {
        Document doc = new Document();
        doc.add(new StringField("country_name", countryName, Field.Store.YES));
        doc.add(new StringField("city_name", cityName, Field.Store.YES));
        // Term vectors are required for getTermVector(...) to return data.
        doc.add(new Field("country_text", countryText, TEXT_WITH_TERM_VECTORS));
        doc.add(new TextField("city_text", cityText, Field.Store.YES));

        w.addDocument(doc);
    }

    /** Creates the frozen field type used for text that must expose term vectors. */
    private static FieldType buildTermVectorType() {
        FieldType type = new FieldType(TextField.TYPE_STORED);
        type.setStoreTermVectors(true);
        type.freeze();
        return type;
    }

}

답변

0

이 질문(question)에 따르면 용어 빈도(term frequency)를 얻는 데에는 TextField를 사용할 수 없습니다. TextField는 그것을 계산하지 않기 때문입니다. 대신 "Field"를 사용하십시오.