基于lucene 4.3.0
MoreLikeThis 的功能是:给定一个源 document 以及其中用于匹配的 fieldName,根据这些 field 的 term vector 生成一个查询对象(Query),用该 Query 即可检索出与源 document 相似的其他 document。具体代码如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
public static void main(String[] args) throws Exception { Directory directory = new RAMDirectory(); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_43, analyzer); IndexWriter writer = new IndexWriter(directory, conf); // Field选项,让其存储term vector加快like分析速度, // 否则需要在执行mlt.like(0)时动态的生成term vector FieldType TYPE_STORED = new FieldType(); TYPE_STORED.setIndexed(true); TYPE_STORED.setTokenized(true); TYPE_STORED.setStored(true); TYPE_STORED.setStoreTermVectors(true); TYPE_STORED.freeze(); String[] docs = { "JQuery in Action", "Lucene in Action", "Sprint in Action", "Thinking in Java" }; for (int i = 0; i < docs.length; i++) { Document d = new Document(); d.add(new Field("content", docs[i], TYPE_STORED)); writer.addDocument(d); } writer.close(); // ----------------向目录对象中添加document------------------ IndexReader reader = DirectoryReader.open(directory); IndexSearcher searcher = new IndexSearcher(reader); MoreLikeThis mlt = new MoreLikeThis(reader); //如果存储中已经提供term vector,可以不用提供分析器啦, //分析器的作用也就产生term vector mlt.setAnalyzer(analyzer); //不设置的话会使用名为"contens"的fieldName mlt.setFieldNames(new String[] { "content" }); //term在源document中出现给定的次数才是一个有效的term mlt.setMinTermFreq(1); //一个term至少要在给定的document中出现,查看源码,MoreLikeThis.createQueue中, //这个值和IndexReader.docFreq(Term term)返回值进行比较的 mlt.setMinDocFreq(1); Query query = mlt.like(0); TopDocs topDocs = searcher.search(query, 10); for (ScoreDoc doc : topDocs.scoreDocs) { System.out.println(doc.doc); Document dlike = reader.document(doc.doc); System.out.println(dlike.getField("content").stringValue()); } } |
本博客由javacoder.cn整理
Posted in: Lucene
Comments are closed.