Serializing/compiling an HMM model to a file in Java

I have successfully evaluated LingPipe's HMM implementation on my own POS corpus (accuracy above 90%). The Ant target for evaluating the HMM POS tagger on my own corpus is the same as the one for the Brown POS corpus:
<target name="eval-brown"
        depends="compile">
  <java classname="EvaluatePos"
        fork="true"
        maxMemory="512M">
    <jvmarg value="-server"/>
    <classpath refid="classpath.standard"/>
    <arg value="1"/>                  <!-- sent eval rate -->
    <arg value="50000"/>              <!-- toks before eval -->
    <arg value="10"/>                 <!-- max n-best -->
    <arg value="8"/>                  <!-- n-gram size -->
    <arg value="128"/>                <!-- num characters -->
    <arg value="8.0"/>                <!-- interpolation ratio -->
    <arg value="BrownPosCorpus"/>     <!-- corpus implementation class -->
    <arg value="${data.pos.brown}"/>  <!-- corpus data path -->
    <arg value="true"/>               <!-- smooth tags -->
  </java>
</target>
The class that evaluates the hidden Markov model POS tagger, EvaluatePos.java, is given below:
import com.aliasi.classify.ConfusionMatrix;
import com.aliasi.corpus.ObjectHandler;
import com.aliasi.corpus.Parser;
import com.aliasi.hmm.HmmCharLmEstimator;
import com.aliasi.hmm.HmmDecoder;
import com.aliasi.tag.MarginalTaggerEvaluator;
import com.aliasi.tag.NBestTaggerEvaluator;
import com.aliasi.tag.TaggerEvaluator;
import com.aliasi.tag.Tagging;
import com.aliasi.util.Strings;

import org.xml.sax.InputSource;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

public class EvaluatePos {

    final int mSentEvalRate;
    final int mToksBeforeEval;
    final int mMaxNBest;
    final int mNGram;
    final int mNumChars;
    final double mLambdaFactor;
    final PosCorpus mCorpus;
    final Set<String> mTagSet = new HashSet<String>();

    HmmCharLmEstimator mEstimator;
    TaggerEvaluator<String> mTaggerEvaluator;
    NBestTaggerEvaluator<String> mNBestTaggerEvaluator;
    MarginalTaggerEvaluator<String> mMarginalTaggerEvaluator;

    int mTrainingSentenceCount = 0;
    int mTrainingTokenCount = 0;

    public EvaluatePos(String[] args) throws Exception {
        mSentEvalRate = Integer.valueOf(args[0]);
        mToksBeforeEval = Integer.valueOf(args[1]);
        mMaxNBest = Integer.valueOf(args[2]);
        mNGram = Integer.valueOf(args[3]);
        mNumChars = Integer.valueOf(args[4]);
        mLambdaFactor = Double.valueOf(args[5]);
        String constructorName = args[6];
        File corpusFile = new File(args[7]);
        Object[] consArgs = new Object[] { corpusFile };
        // reflectively construct the corpus from its class name
        @SuppressWarnings("rawtypes") // req 2 step
        PosCorpus corpus
            = (PosCorpus)
              Class
                  .forName(constructorName)
                  .getConstructor(new Class[] { File.class })
                  .newInstance(consArgs);
        mCorpus = corpus;
    }

    void run() throws IOException {
        System.out.println("\nCOMMAND PARAMETERS:");
        System.out.println(" Sent eval rate=" + mSentEvalRate);
        System.out.println(" Toks before eval=" + mToksBeforeEval);
        System.out.println(" Max n-best eval=" + mMaxNBest);
        System.out.println(" Max n-gram=" + mNGram);
        System.out.println(" Num chars=" + mNumChars);
        System.out.println(" Lambda factor=" + mLambdaFactor);

        // first pass over the corpus: collect the tag set and counts
        CorpusProfileHandler profileHandler = new CorpusProfileHandler();
        parseCorpus(profileHandler);
        String[] tags = mTagSet.toArray(Strings.EMPTY_STRING_ARRAY);
        Arrays.sort(tags);
        Set<String> tagSet = new HashSet<String>();
        for (String tag : tags)
            tagSet.add(tag);

        System.out.println("\nCORPUS PROFILE:");
        System.out.println(" Corpus class=" + mCorpus.getClass().getName());
        System.out.println(" #Sentences=" + mTrainingSentenceCount);
        System.out.println(" #Tokens=" + mTrainingTokenCount);
        System.out.println(" #Tags=" + tags.length);
        System.out.println(" Tags=" + Arrays.asList(tags));

        System.out.println("\nEVALUATION:");
        mEstimator
            = new HmmCharLmEstimator(mNGram, mNumChars, mLambdaFactor);
        for (int i = 0; i < tags.length; ++i)
            mEstimator.addState(tags[i]);
        HmmDecoder decoder
            = new HmmDecoder(mEstimator); // no caching
        boolean storeTokens = true;
        mTaggerEvaluator
            = new TaggerEvaluator<String>(decoder, storeTokens);
        mNBestTaggerEvaluator
            = new NBestTaggerEvaluator<String>(decoder, mMaxNBest, mMaxNBest);
        mMarginalTaggerEvaluator
            = new MarginalTaggerEvaluator<String>(decoder, tagSet, storeTokens);

        // second pass: interleave evaluation and training (learning curve)
        LearningCurveHandler evaluationHandler
            = new LearningCurveHandler();
        parseCorpus(evaluationHandler);

        System.out.println("\n\n\nFINAL REPORT");
        System.out.println("\n\nFirst Best Evaluation");
        System.out.println(mTaggerEvaluator.tokenEval());
        System.out.println("\n\nN Best Evaluation");
        System.out.println(mNBestTaggerEvaluator.nBestHistogram());
    }

    void parseCorpus(ObjectHandler<Tagging<String>> handler)
            throws IOException {
        Parser<ObjectHandler<Tagging<String>>> parser = mCorpus.parser();
        parser.setHandler(handler);
        Iterator<InputSource> it = mCorpus.sourceIterator();
        while (it.hasNext()) {
            InputSource in = it.next();
            parser.parse(in);
        }
    }

    class CorpusProfileHandler implements ObjectHandler<Tagging<String>> {
        public void handle(Tagging<String> tagging) {
            ++mTrainingSentenceCount;
            mTrainingTokenCount += tagging.size();
            for (int i = 0; i < tagging.size(); ++i)
                mTagSet.add(tagging.tag(i));
        }
    }

    class LearningCurveHandler implements ObjectHandler<Tagging<String>> {
        Set<String> mKnownTokenSet = new HashSet<String>();
        int mUnknownTokensTotal = 0;
        int mUnknownTokensCorrect = 0;
        public void handle(Tagging<String> tagging) {
            if (mEstimator.numTrainingTokens() > mToksBeforeEval
                && mEstimator.numTrainingCases() % mSentEvalRate == 0) {

                mTaggerEvaluator.handle(tagging);
                mNBestTaggerEvaluator.handle(tagging);
                mMarginalTaggerEvaluator.handle(tagging);
                System.out.println("\nTest Case "
                                   + mTaggerEvaluator.numCases());
                System.out.println("First Best Last Case Report");
                System.out.println(mTaggerEvaluator
                                   .lastCaseToString(mKnownTokenSet));
                System.out.println("N-Best Last Case Report");
                System.out.println(mNBestTaggerEvaluator.lastCaseToString(5));
                System.out.println("Marginal Last Case Report");
                System.out.println(mMarginalTaggerEvaluator.lastCaseToString(5));
                System.out.println("Cumulative Evaluation");
                System.out.print(" Estimator: #Train Cases="
                                 + mEstimator.numTrainingCases());
                System.out.println(" #Train Toks="
                                   + mEstimator.numTrainingTokens());
                ConfusionMatrix tokenEval
                    = mTaggerEvaluator.tokenEval().confusionMatrix();
                System.out.println(" First Best Accuracy (All Tokens) = "
                                   + tokenEval.totalCorrect()
                                   + "/" + tokenEval.totalCount()
                                   + " = " + tokenEval.totalAccuracy());
                ConfusionMatrix unkTokenEval
                    = mTaggerEvaluator
                      .unknownTokenEval(mKnownTokenSet)
                      .confusionMatrix();
                mUnknownTokensTotal += unkTokenEval.totalCount();
                mUnknownTokensCorrect += unkTokenEval.totalCorrect();
                System.out.println(" First Best Accuracy (Unknown Tokens) = "
                                   + mUnknownTokensCorrect
                                   + "/" + mUnknownTokensTotal
                                   + " = " + (mUnknownTokensCorrect
                                              / (double) mUnknownTokensTotal));
            }
            // train after eval
            mEstimator.handle(tagging);
            for (int i = 0; i < tagging.size(); ++i)
                mKnownTokenSet.add(tagging.token(i));
        }
    }

    public static void main(String[] args) throws Exception {
        new EvaluatePos(args).run();
    }
}
My question is how to create the HMM model file so that I can use it as a feature in chain CRF-based NER.

How was pos-en-general-brown.HiddenMarkovModel in LingPipe's ../../models folder created?

As far as I can tell, to create the HMM model file I should insert the following code into the setup I am already using (BrownPosCorpus.java, BrownPosParser.java, and EvaluatePos.java):
// write output to file
File modelFile = new File(args[1]);
AbstractExternalizable.compileTo(estimator,modelFile);
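
In other words, I imagine a small trainer class along these lines (my own untested sketch; TrainPos is not part of the LingPipe distribution, and it reuses the PosCorpus interface from the demo code):

import com.aliasi.corpus.ObjectHandler;
import com.aliasi.corpus.Parser;
import com.aliasi.hmm.HmmCharLmEstimator;
import com.aliasi.tag.Tagging;
import com.aliasi.util.AbstractExternalizable;

import org.xml.sax.InputSource;

import java.io.File;
import java.util.Iterator;

// Hypothetical trainer: trains the HMM estimator on every sentence
// in the corpus, then compiles it to a model file.
public class TrainPos {
    public static void main(String[] args) throws Exception {
        String corpusClassName = args[0];    // e.g. BrownPosCorpus
        File corpusFile = new File(args[1]); // e.g. ${data.pos.brown}
        File modelFile = new File(args[2]);  // output model file
        int nGram = 8;                       // same settings as eval-brown
        int numChars = 128;
        double lambdaFactor = 8.0;

        PosCorpus corpus
            = (PosCorpus)
              Class
                  .forName(corpusClassName)
                  .getConstructor(new Class[] { File.class })
                  .newInstance(new Object[] { corpusFile });

        HmmCharLmEstimator estimator
            = new HmmCharLmEstimator(nGram, numChars, lambdaFactor);

        // the estimator is an ObjectHandler<Tagging<String>>,
        // so it can be trained directly by the corpus parser
        Parser<ObjectHandler<Tagging<String>>> parser = corpus.parser();
        parser.setHandler(estimator);
        Iterator<InputSource> it = corpus.sourceIterator();
        while (it.hasNext())
            parser.parse(it.next());

        // write the compiled model to file
        AbstractExternalizable.compileTo(estimator, modelFile);
    }
}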
How can I change the Ant file to build the POS HMM model file?
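Presumably the Ant side would be a new target that runs the trainer instead of the evaluator, something like this (hypothetical; the target name, the TrainPos class above, and the output file name are my own):

<target name="compile-brown-pos"
        depends="compile">
  <java classname="TrainPos"
        fork="true"
        maxMemory="512M">
    <jvmarg value="-server"/>
    <classpath refid="classpath.standard"/>
    <arg value="BrownPosCorpus"/>                   <!-- corpus implementation class -->
    <arg value="${data.pos.brown}"/>                <!-- corpus data path -->
    <arg value="pos-my-corpus.HiddenMarkovModel"/>  <!-- output model file -->
  </java>
</target>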
I then want to use the POS HMM model file as a feature in the chain CRF feature extractor:
...
static final File POS_HMM_FILE
    = new File("../../models/pos-en-general-brown.HiddenMarkovModel");
...
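
Following the pattern of LingPipe's CRF tutorial, I imagine the extractor would look roughly like the sketch below (my own untested code; the class name PosHmmFeatureExtractor and the feature names are made up):

import com.aliasi.crf.ChainCrfFeatureExtractor;
import com.aliasi.crf.ChainCrfFeatures;
import com.aliasi.hmm.HiddenMarkovModel;
import com.aliasi.hmm.HmmDecoder;
import com.aliasi.tag.Tagging;
import com.aliasi.util.AbstractExternalizable;
import com.aliasi.util.ObjectToDoubleMap;

import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Map;

// Hypothetical feature extractor that feeds HMM POS tags to a chain CRF.
public class PosHmmFeatureExtractor
    implements ChainCrfFeatureExtractor<String> {

    static final File POS_HMM_FILE
        = new File("../../models/pos-en-general-brown.HiddenMarkovModel");

    private final HmmDecoder mPosTagger;

    public PosHmmFeatureExtractor()
            throws IOException, ClassNotFoundException {
        // read back the compiled HMM and wrap it in a decoder
        HiddenMarkovModel posHmm = (HiddenMarkovModel)
            AbstractExternalizable.readObject(POS_HMM_FILE);
        mPosTagger = new HmmDecoder(posHmm);
    }

    public ChainCrfFeatures<String> extract(List<String> tokens,
                                            List<String> tags) {
        // run the POS tagger once per sentence, reuse for every node
        return new PosFeatures(tokens, tags, mPosTagger.tag(tokens));
    }

    static class PosFeatures extends ChainCrfFeatures<String> {
        private final Tagging<String> mPosTagging;
        PosFeatures(List<String> tokens, List<String> tags,
                    Tagging<String> posTagging) {
            super(tokens, tags);
            mPosTagging = posTagging;
        }
        public Map<String,? extends Number> nodeFeatures(int n) {
            ObjectToDoubleMap<String> feats = new ObjectToDoubleMap<String>();
            feats.set("TOK_" + token(n), 1.0);           // surface token
            feats.set("POS_" + mPosTagging.tag(n), 1.0); // HMM POS tag
            return feats;
        }
        public Map<String,? extends Number> edgeFeatures(int n, int k) {
            ObjectToDoubleMap<String> feats = new ObjectToDoubleMap<String>();
            feats.set("PREV_TAG_" + tag(k), 1.0);        // previous output tag
            return feats;
        }
    }
}

As far as I understand, if the trained CRF is later compiled to disk, the feature extractor itself must also be serializable, which this sketch omits for brevity.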
Best regards.